In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
genesis_ic = wn.ic(genesis, False, 0.0)

import numpy as np
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler as MMS

In [2]:
data = pd.read_csv('train2.csv')
data.drop('Id', axis=1 , inplace=True)
data.head()
#This data isn't extremely good — it's just reddit comments that were labeled with a specific topic
#There might be some noise to the data

Unnamed: 0,Comment,Topic
0,A few things. You might have negative- frequen...,Biology
1,Is it so hard to believe that there exist part...,Physics
2,There are bees,Biology
3,I'm a medication technician. And that's alot o...,Biology
4,Cesium is such a pretty metal.,Chemistry


In [3]:
data.shape

(8695, 2)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
#intializing the TfidVectorizer object... helps vectorize and normalize the data using a logarithmic function 
CommentsToBeTokenized = data['Comment'] 
tfidf_result=tfidf.fit_transform(CommentsToBeTokenized).toarray()
#transforming each comment into an n-dimensional vector, making a matrix out of all of the comments  
tfidf_result #most of these are 0 because the word doesn't appear in the sparse matrix 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
tfidf_result.shape #18177 unique strings 

(8695, 18177)

In [6]:
vectorizer = TfidfVectorizer(max_features= 6000)
#changing the TfidVectorizer object... this is because I did some hyperparameter testing with max features 

In [7]:
tfidf_result = vectorizer.fit_transform(CommentsToBeTokenized).toarray()
tfidf_result #sparse matrix of size (8695, 6000) 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
tfidf_result = vectorizer.fit_transform(CommentsToBeTokenized).toarray()
features = vectorizer.get_feature_names_out()
tfidf_result = pd.DataFrame(tfidf_result, columns=features)
tfidf_result.head()
#There are some random collections of characters, but that's expected given our quality of data

Unnamed: 0,000,01,019,02,020,021,03,04,07,09,...,yours,yourself,youtu,youtube,yt,yup,zeolites,zero,zinc,zp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_result, data['Topic'], test_size= 0.2, random_state = 110)

In [10]:
for k in range(1,10):
    model = KNC(n_neighbors= k )
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    print("\nTesting Results:\n")
    print(classification_report(y_test, y_predicted))


Testing Results:

              precision    recall  f1-score   support

     Biology       0.85      0.35      0.50       730
   Chemistry       0.64      0.16      0.25       571
     Physics       0.31      0.91      0.46       438

    accuracy                           0.43      1739
   macro avg       0.60      0.47      0.40      1739
weighted avg       0.64      0.43      0.41      1739


Testing Results:

              precision    recall  f1-score   support

     Biology       0.45      0.93      0.61       730
   Chemistry       0.60      0.15      0.23       571
     Physics       0.60      0.12      0.20       438

    accuracy                           0.47      1739
   macro avg       0.55      0.40      0.35      1739
weighted avg       0.54      0.47      0.38      1739


Testing Results:

              precision    recall  f1-score   support

     Biology       0.70      0.36      0.48       730
   Chemistry       0.53      0.06      0.11       571
     Physics      

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB(alpha = 1) 
#again, this is from hyperparameter testing 

In [12]:
NB.fit(x_train, y_train)

MultinomialNB(alpha=1)

In [13]:
ypred1 = NB.predict(x_train)
ypred = NB.predict(x_test)

In [14]:
ypred.shape

(1739,)

In [15]:
ytrain = y_train[0:1739]
ytest = y_test[0:1739]
ypred1 = ypred1[0:1739]
ypred = ypred[0:1739]

In [16]:
print("Training Results:\n")
print(classification_report(ytrain, ypred1))
print("\nTesting Results:\n")
print(classification_report(ytest, ypred))

Training Results:

              precision    recall  f1-score   support

     Biology       0.73      0.93      0.82       723
   Chemistry       0.82      0.76      0.79       580
     Physics       0.95      0.60      0.74       436

    accuracy                           0.79      1739
   macro avg       0.83      0.77      0.78      1739
weighted avg       0.81      0.79      0.79      1739


Testing Results:

              precision    recall  f1-score   support

     Biology       0.64      0.89      0.75       730
   Chemistry       0.71      0.62      0.66       571
     Physics       0.89      0.46      0.61       438

    accuracy                           0.70      1739
   macro avg       0.75      0.66      0.67      1739
weighted avg       0.73      0.70      0.69      1739



In [17]:
from sklearn.ensemble import RandomForestClassifier 

In [18]:
clf = RandomForestClassifier(max_depth = 9)

In [19]:
clf.fit(x_train, y_train)

RandomForestClassifier(max_depth=9)

In [20]:
ypred1 = clf.predict(x_train)
ypred = clf.predict(x_test)

In [21]:
ytrain = y_train[0:1739]
ytest = y_test[0:1739]
ypred1 = ypred1[0:1739]
ypred = ypred[0:1739]

In [22]:
print("Training Results:\n")
print(classification_report(ytrain, ypred1))
print("\nTesting Results:\n")
print(classification_report(ytest, ypred))
#overfitting on the chemistry and physics dataset... might not be good for predicting biology

Training Results:

              precision    recall  f1-score   support

     Biology       0.47      1.00      0.64       723
   Chemistry       0.98      0.22      0.36       580
     Physics       1.00      0.14      0.25       436

    accuracy                           0.52      1739
   macro avg       0.81      0.45      0.41      1739
weighted avg       0.77      0.52      0.45      1739


Testing Results:

              precision    recall  f1-score   support

     Biology       0.45      0.99      0.62       730
   Chemistry       0.85      0.13      0.23       571
     Physics       0.96      0.11      0.20       438

    accuracy                           0.49      1739
   macro avg       0.76      0.41      0.35      1739
weighted avg       0.71      0.49      0.39      1739



In [23]:
#We probably want to use multinomialNB to perform predictions 
def online_test(question, right_label):
    # convert the question to a vector, using an already existing vector.
    # There is a subtle issue here - if question includes a word that
    # isn't already in the dataset, the transformer/vectorizor won't handle
    # this.
    #
    # This can be solved, but with careful consideration. Read about online 
    # feature extraction for text for some solutions. 
    #
    # Online tf-idf example https://github.com/idoshlomo/online_vectorizers
    tfidf = TfidfVectorizer()
    tfidf_result=tfidf.fit_transform(CommentsToBeTokenized).toarray()

    vectorizer = TfidfVectorizer(max_features=6000)
    tfidf_result = vectorizer.fit_transform(CommentsToBeTokenized).toarray()
    features = vectorizer.get_feature_names_out()
    tfidf_result = pd.DataFrame(tfidf_result, columns=features)
    
    question_as_sparse_matrix = vectorizer.transform([question])
    question_as_vector = np.array(question_as_sparse_matrix.todense())

    # creates a new model 
    NB= MultinomialNB()

    xtrain, xtest, ytrain, ytest = train_test_split(tfidf_result, data['Topic'], test_size= 0.2)
    NB.fit(xtrain, ytrain)

    ypred1 = NB.predict(xtrain)
    ypred = NB.predict(xtest)

    ytrain=ytrain[0:1739]
    ytest=ytest[0:1739]
    ypred1 = ypred1[0:1739]
    ypred = ypred[0:1739]

    # predict the label
    prediction = NB.predict(question_as_vector)
    results = {'Comment': question, 'Topic': right_label, 'Guess' : prediction[0]}
    print(results) 
    # check if the prediction is correct
    if prediction[0] == right_label:
        return "Correct"
    else:
        return "Incorrect"

In [24]:
online_test("What is molecular orbital theory?", "Chemistry")

{'Comment': 'What is molecular orbital theory?', 'Topic': 'Chemistry', 'Guess': 'Chemistry'}


'Correct'

In [25]:
online_test("What types of molecules are aromatic? What about non-aromatic?", "Chemistry")

{'Comment': 'What types of molecules are aromatic? What about non-aromatic?', 'Topic': 'Chemistry', 'Guess': 'Chemistry'}


'Correct'

In [28]:
from online_vectorizers import OnlineTfidfVectorizer
online_tfidf = OnlineTfidfVectorizer()  


In [10]:
KNC().get_params().keys()
a = [3, 4, 5, 6, 7]  
    
KNC().get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [17]:
a= ['uniform', 'distance']
a = []  
for i in range (4, 8): 
    a.append(i * 5)
gs_knn = GridSearchCV (KNC(), param_grid = {'leaf_size': a}, scoring = 'accuracy', cv = 5)
gs_knn.fit(x_train, y_train)
gs_knn.best_params_

# gs_knn.best_params_  n_neighbors = 4 
# weights = distance 
#leaf_size = 20 


{'leaf_size': 20}

In [23]:
a = []
for i in range (1, 20): 
    a.append(i * 0.01)
gs_nb = GridSearchCV (MultinomialNB(), param_grid = {'alpha': a}, scoring = 'accuracy', cv = 5)
gs_nb.fit(x_train, y_train)
gs_nb.best_params_
#best parameter is alpha = 0.1 

{'alpha': 0.1}

In [24]:
MultinomialNB(alpha = 0.1)
NB.fit(x_train, y_train)
ypred1 = NB.predict(x_train)
ypred = NB.predict(x_test)
ytrain = y_train[0:1739]
ytest = y_test[0:1739]
ypred1 = ypred1[0:1739]
ypred = ypred[0:1739]
print("Training Results:\n")
print(classification_report(ytrain, ypred1))
print("\nTesting Results:\n")
print(classification_report(ytest, ypred))

Training Results:

              precision    recall  f1-score   support

     Biology       0.73      0.93      0.82       723
   Chemistry       0.82      0.76      0.79       580
     Physics       0.95      0.60      0.74       436

    accuracy                           0.79      1739
   macro avg       0.83      0.77      0.78      1739
weighted avg       0.81      0.79      0.79      1739


Testing Results:

              precision    recall  f1-score   support

     Biology       0.64      0.89      0.75       730
   Chemistry       0.71      0.62      0.66       571
     Physics       0.89      0.46      0.61       438

    accuracy                           0.70      1739
   macro avg       0.75      0.66      0.67      1739
weighted avg       0.73      0.70      0.69      1739



In [25]:
model = KNC(n_neighbors= 4, weights = 'distance', leaf_size = 20 )
model.fit(x_train, y_train)
y_predicted = model.predict(x_test)
print("\nTesting Results:\n")
print(classification_report(y_test, y_predicted))


Testing Results:

              precision    recall  f1-score   support

     Biology       0.65      0.39      0.49       730
   Chemistry       0.39      0.69      0.49       571
     Physics       0.31      0.20      0.25       438

    accuracy                           0.44      1739
   macro avg       0.45      0.43      0.41      1739
weighted avg       0.48      0.44      0.43      1739



In [None]:
gs_knn = GridSearchCV (RandomForestClassifier(), param_grid = {'leaf_size': a}, scoring = 'accuracy', cv = 5)
gs_knn.fit(x_train, y_train)
gs_knn.best_params_