In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df=pd.read_csv('/content/drive/MyDrive/Copy of FinallyLemmas.csv')

In [4]:
X = df.drop(columns='label')
y = df['label']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test,y_test, test_size=0.5, random_state=42)


# Train 70%
# Validation 15%
# Testing 15%

In [6]:
tfidf_vectorizer = TfidfVectorizer()

train_feature_tfidf = tfidf_vectorizer.fit_transform(X_train.lemmaText)

test_feature_tfidf = tfidf_vectorizer.transform(X_val.lemmaText)

svm_classifier = SVC(kernel='rbf')

svm_classifier.fit(train_feature_tfidf, y_train)

svm_prediction = svm_classifier.predict(test_feature_tfidf)

svm_accuracy = accuracy_score(y_val, svm_prediction)
print(f"SVM Accuracy: {svm_accuracy:.2f}")

print(classification_report(y_val, svm_prediction))


SVM Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       983
           1       0.98      0.99      0.98       973
           2       0.94      0.95      0.95       970
           3       0.96      0.99      0.98       972
           4       0.96      0.95      0.96       961
           5       0.96      0.98      0.97       993
           6       0.99      0.99      0.99       934
           7       0.97      0.94      0.95       950
           8       0.98      0.96      0.97       967

    accuracy                           0.97      8703
   macro avg       0.97      0.97      0.97      8703
weighted avg       0.97      0.97      0.97      8703



In [7]:
Testing = tfidf_vectorizer.transform(X_test.lemmaText)


In [8]:
svm_prediction = svm_classifier.predict(Testing)

svm_accuracy = accuracy_score(y_test, svm_prediction)
print(f"SVM Accuracy: {svm_accuracy:.2f}")

print(classification_report(y_test, svm_prediction))


SVM Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       995
           1       0.98      0.99      0.98       971
           2       0.94      0.96      0.95       941
           3       0.95      0.98      0.96       920
           4       0.96      0.95      0.96      1015
           5       0.95      0.97      0.96       961
           6       1.00      0.99      0.99       977
           7       0.97      0.94      0.95       938
           8       0.98      0.96      0.97       986

    accuracy                           0.97      8704
   macro avg       0.97      0.97      0.97      8704
weighted avg       0.97      0.97      0.97      8704



In [9]:
import joblib
joblib.dump(svm_classifier, 'LastSVM.pkl')


['LastSVM.pkl']

In [12]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state=42)
lr_clf.fit(train_feature_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
training_accuracy = lr_clf.score(train_feature_tfidf, y_train)
print("Training Accuracy:", training_accuracy)

Training Accuracy: 0.9825930667717155


In [14]:
y_pred = lr_clf.predict(test_feature_tfidf)
val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.9662185453291968


In [15]:
lr_prediction = lr_clf.predict(Testing)

lr_accuracy = accuracy_score(y_test, lr_prediction)
print(f"LR Accuracy: {lr_accuracy:.2f}")

print(classification_report(y_test, lr_prediction))


LR Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       995
           1       0.98      0.99      0.98       971
           2       0.93      0.96      0.94       941
           3       0.94      0.97      0.96       920
           4       0.96      0.95      0.95      1015
           5       0.95      0.97      0.96       961
           6       1.00      0.99      0.99       977
           7       0.97      0.93      0.95       938
           8       0.97      0.95      0.96       986

    accuracy                           0.96      8704
   macro avg       0.96      0.96      0.96      8704
weighted avg       0.96      0.96      0.96      8704



In [17]:
joblib.dump(lr_clf, 'LastLR.pkl')


['LastLR.pkl']

In [21]:
import xgboost as xgb


xgb_classifier = xgb.XGBClassifier()

xgb_classifier.fit(train_feature_tfidf, y_train)

xgb_prediction = xgb_classifier.predict(test_feature_tfidf)

xgb_accuracy = accuracy_score(y_val, xgb_prediction)
print(f"XGBoost Accuracy: {xgb_accuracy:.2f}")

print(classification_report(y_val, xgb_prediction))


XGBoost Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       983
           1       0.98      0.98      0.98       973
           2       0.92      0.95      0.93       970
           3       0.97      0.97      0.97       972
           4       0.96      0.93      0.95       961
           5       0.95      0.97      0.96       993
           6       0.99      0.99      0.99       934
           7       0.94      0.93      0.94       950
           8       0.97      0.95      0.96       967

    accuracy                           0.96      8703
   macro avg       0.96      0.96      0.96      8703
weighted avg       0.96      0.96      0.96      8703



In [22]:
xgb_prediction = xgb_classifier.predict(Testing)

xgb_accuracy = accuracy_score(y_test, xgb_prediction)
print(f"XGBoost Accuracy: {xgb_accuracy:.2f}")

print(classification_report(y_test, xgb_prediction))


XGBoost Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       995
           1       0.98      0.99      0.98       971
           2       0.92      0.96      0.94       941
           3       0.95      0.97      0.96       920
           4       0.98      0.94      0.96      1015
           5       0.95      0.97      0.96       961
           6       0.99      0.98      0.99       977
           7       0.95      0.93      0.94       938
           8       0.96      0.96      0.96       986

    accuracy                           0.96      8704
   macro avg       0.96      0.96      0.96      8704
weighted avg       0.96      0.96      0.96      8704



In [23]:
joblib.dump(xgb_classifier, 'LastXGB.pkl')


['LastXGB.pkl']

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(train_feature_tfidf, y_train)

# Make predictions on the validation data
nb_prediction = nb_classifier.predict(test_feature_tfidf)

# Calculate accuracy
nb_accuracy = accuracy_score(y_val, nb_prediction)
print(f"Naive Bayes Accuracy: {nb_accuracy:.2f}")

# You can also print other metrics like classification report
print(classification_report(y_val, nb_prediction))


Naive Bayes Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.99      0.92      0.95       983
           1       0.95      0.97      0.96       973
           2       0.86      0.93      0.90       970
           3       0.96      0.96      0.96       972
           4       0.90      0.92      0.91       961
           5       0.88      0.98      0.92       993
           6       0.99      0.98      0.99       934
           7       0.98      0.87      0.92       950
           8       0.94      0.92      0.93       967

    accuracy                           0.94      8703
   macro avg       0.94      0.94      0.94      8703
weighted avg       0.94      0.94      0.94      8703



In [26]:
nb_prediction = nb_classifier.predict(Testing)

nb_accuracy = accuracy_score(y_test, nb_prediction)
print(f"NB Accuracy: {nb_accuracy:.2f}")

print(classification_report(y_test, nb_prediction))


NB Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.99      0.90      0.95       995
           1       0.96      0.98      0.97       971
           2       0.88      0.94      0.91       941
           3       0.95      0.95      0.95       920
           4       0.92      0.94      0.93      1015
           5       0.86      0.97      0.91       961
           6       1.00      0.98      0.99       977
           7       0.98      0.86      0.92       938
           8       0.93      0.91      0.92       986

    accuracy                           0.94      8704
   macro avg       0.94      0.94      0.94      8704
weighted avg       0.94      0.94      0.94      8704



In [27]:
joblib.dump(xgb_classifier, 'LastNB.pkl')


['LastNB.pkl']