## **dawnload Question type dataset**

In [1]:
# mount the drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd

file_path = '/content/drive/My Drive/NLP_project/message_types.csv'
dataset = pd.read_csv(file_path)

# Print the first few rows
dataset.head()

Unnamed: 0,text,type
0,do iran and afghanistan speak the same language,yn
1,do good samaritan laws protect those who help ...,yn
2,is windows movie maker part of windows essentials,yn
3,is confectionary sugar the same as powdered sugar,yn
4,is elder scrolls online the same as skyrim,yn


In [3]:
# Display the structure and summary of the dataset
print(dataset.info())
print(dataset.describe())

# Check for missing values in the dataset
print(dataset.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153079 entries, 0 to 153078
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    153079 non-null  object
 1   type    153079 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB
None
                                        text    type
count                                 153079  153079
unique                                152959       4
top     In what year did World War II start?      wh
freq                                       3  140329
text    0
type    0
dtype: int64


## **heuristic functions**

In [4]:
#heuristic functions to classify the text based on simple keywords
def label_question_type(text):
    text = text.lower()
    if text.startswith(('is', 'are', 'can', 'should', 'do', 'does', 'have', 'has')):
        return 'yn'
    elif any(word in text for word in ['what', 'where', 'when', 'why', 'how']):
        return 'wh'
    elif any(word in text for word in ['tell', 'ask', 'request', 'remind']):
        return 'req'
    else:
        return 'stmt'

# Apply the heuristic function to a new column
dataset['predicted_type'] = dataset['text'].apply(label_question_type)

# Print the first few rows to see the heuristic labeling
print(dataset.head())

                                                text type predicted_type
0    do iran and afghanistan speak the same language   yn             yn
1  do good samaritan laws protect those who help ...   yn             yn
2  is windows movie maker part of windows essentials   yn             yn
3  is confectionary sugar the same as powdered sugar   yn             yn
4         is elder scrolls online the same as skyrim   yn             yn


## **Logistic Regression model**

In [6]:
# build a Logistic Regression model to predict the question types
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

# Split the data into features and target variable
X = dataset['text']
y = dataset['type']

# Convert text data into numerical data using CountVectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9, ngram_range=(1,2))
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000, C=0.5)
model.fit(X_train, y_train)

# Predict on the testing set
y_pred = model.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save the model
model_path = '/content/drive/My Drive/NLP_Project/logistic_regression_model.joblib'
joblib.dump(model, model_path)

# Save the vectorizer
vectorizer_path = '/content/drive/My Drive/NLP_Project/count_vectorizer.joblib'
joblib.dump(vectorizer, vectorizer_path)


Accuracy: 0.9987914815782597
              precision    recall  f1-score   support

         req       1.00      0.20      0.33         5
        stmt       0.50      0.17      0.25         6
          wh       1.00      1.00      1.00     28154
          yn       1.00      0.99      0.99      2451

    accuracy                           1.00     30616
   macro avg       0.87      0.59      0.64     30616
weighted avg       1.00      1.00      1.00     30616

Confusion Matrix:
 [[    1     0     1     3]
 [    0     1     0     5]
 [    0     0 28150     4]
 [    0     1    23  2427]]


['/content/drive/My Drive/NLP_Project/count_vectorizer.joblib']

# **Dawnload open-orca dataset**

In [8]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [9]:
#dawnlad the data

from datasets import load_dataset
dataset = load_dataset("Open-Orca/OpenOrca")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
# Select the first 2000 instances(our dataset have only train split)
subset = dataset['train'].select(range(2000))

# Extract only the "question" and "response" columns
questions_responses = subset.to_pandas()[['question', 'response']]

# Drop the first row(the descreption raw)
questions_responses = questions_responses.drop(questions_responses.index[0]).reset_index(drop=True)

questions_responses.head(5)

Unnamed: 0,question,response
0,Generate an approximately fifteen-word sentenc...,Midsummer House is a moderately priced Chinese...
1,What happens next in this paragraph?\n\nShe th...,C. She then dips the needle in ink and using t...
2,Please answer the following question: I want t...,"Based on the passage, discuss the primary moti..."
3,James runs a TV show and there are 5 main char...,"James pays the minor characters $15,000 each e..."
4,"Given the stream of consciousness rationale, p...",Question: What is the proper technique for a f...


## **Distanse supervision (lable the data)**

In [11]:
import pandas as pd
import joblib

questions = questions_responses['question']

# Load the saved vectorizer and model
vectorizer_path = '/content/drive/My Drive/NLP_Project/count_vectorizer.joblib'
model_path = '/content/drive/My Drive/NLP_Project/logistic_regression_model.joblib'

vectorizer = joblib.load(vectorizer_path)
model = joblib.load(model_path)

# Transform the new questions using the loaded vectorizer
X = vectorizer.transform(questions)

# Predict the types using our logistic regression model
predicted_types = model.predict(X)

# Add predictions as a new column in the original dataset
questions_responses['type'] = predicted_types

# Save the dataset in Google Drive
dataset_path = '/content/drive/My Drive/NLP_Project/questions_responses.csv'
questions_responses.to_csv(dataset_path, index=False)

# Display the updated DataFrame
questions_responses.head()


Unnamed: 0,question,response,type
0,Generate an approximately fifteen-word sentenc...,Midsummer House is a moderately priced Chinese...,yn
1,What happens next in this paragraph?\n\nShe th...,C. She then dips the needle in ink and using t...,wh
2,Please answer the following question: I want t...,"Based on the passage, discuss the primary moti...",wh
3,James runs a TV show and there are 5 main char...,"James pays the minor characters $15,000 each e...",wh
4,"Given the stream of consciousness rationale, p...",Question: What is the proper technique for a f...,wh
