In [1]:
!pip install spacy pandas scikit-learn
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
import pandas as pd

train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

print(train_data.head())
print(test_data.head())


       Id                                            Comment      Topic
0   0x840  A few things. You might have negative- frequen...    Biology
1   0xbf0  Is it so hard to believe that there exist part...    Physics
2  0x1dfc                                     There are bees    Biology
3   0xc7e  I'm a medication technician. And that's alot o...    Biology
4   0xbba                     Cesium is such a pretty metal.  Chemistry
       Id                                            Comment      Topic
0  0x1aa9  Personally I have no idea what my IQ is. I’ve ...    Biology
1   0x25e  I'm skeptical. A heavier lid would be needed t...    Physics
2  0x1248  I think I have 100 cm of books on the subject....    Biology
3   0x2b9  Is chemistry hard in uni. Ive read somewhere t...  Chemistry
4  0x24af  In addition to the other comment, you can crit...    Physics


In [10]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

train_data['processed_text'] = train_data['Comment'].apply(preprocess_text)
test_data['processed_text'] = test_data['Comment'].apply(preprocess_text)


In [11]:
train_data.head()

Unnamed: 0,Id,Comment,Topic,processed_text
0,0x840,A few things. You might have negative- frequen...,Biology,thing negative- frequency dependent selection ...
1,0xbf0,Is it so hard to believe that there exist part...,Physics,hard believe exist particular detect invent fa...
2,0x1dfc,There are bees,Biology,bee
3,0xc7e,I'm a medication technician. And that's alot o...,Biology,medication technician alot drug liver probably...
4,0xbba,Cesium is such a pretty metal.,Chemistry,cesium pretty metal


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['processed_text'])
y_train = train_data['Topic']

X_test = vectorizer.transform(test_data['processed_text'])
y_test = test_data['Topic']

In [20]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [21]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8284993694829761
              precision    recall  f1-score   support

     Biology       0.81      0.88      0.84       614
   Chemistry       0.79      0.84      0.81       506
     Physics       0.93      0.75      0.83       466

    accuracy                           0.83      1586
   macro avg       0.84      0.82      0.83      1586
weighted avg       0.84      0.83      0.83      1586



In [26]:
parameters = {
    'alpha': [0.01, 0.1, 1, 10],

}

grid_search = GridSearchCV(model, parameters, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Best Parameters: {'alpha': 0.1}
Accuracy: 0.8524590163934426
              precision    recall  f1-score   support

     Biology       0.89      0.85      0.87       614
   Chemistry       0.81      0.85      0.83       506
     Physics       0.86      0.86      0.86       466

    accuracy                           0.85      1586
   macro avg       0.85      0.85      0.85      1586
weighted avg       0.85      0.85      0.85      1586



In [17]:
from sklearn.linear_model import LogisticRegression
model_2 = LogisticRegression()
model_2.fit(X_train, y_train)
y_pred = model_2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8398486759142497
              precision    recall  f1-score   support

     Biology       0.85      0.86      0.85       614
   Chemistry       0.80      0.83      0.81       506
     Physics       0.87      0.83      0.85       466

    accuracy                           0.84      1586
   macro avg       0.84      0.84      0.84      1586
weighted avg       0.84      0.84      0.84      1586



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
from sklearn.ensemble import RandomForestClassifier
model_3 =RandomForestClassifier()
model_3.fit(X_train, y_train)
y_pred = model_3.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7938209331651954
              precision    recall  f1-score   support

     Biology       0.81      0.79      0.80       614
   Chemistry       0.74      0.77      0.76       506
     Physics       0.84      0.82      0.83       466

    accuracy                           0.79      1586
   macro avg       0.80      0.79      0.79      1586
weighted avg       0.79      0.79      0.79      1586



In [27]:
import joblib

joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']

In [29]:
train_data.shape

(8695, 4)

In [30]:
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']