In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the labelled text dataset; later cells read its "text" and "label" columns.
DATASET_PATH = '/content/dataset.csv'
df = pd.read_csv(DATASET_PATH, encoding='utf-8')

In [None]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,text,label
0,I recently went through a breakup and she said...,Not Sucidal
1,"I do not know how to navigate these feelings, ...",Not Sucidal
2,"So I have been with my bf for 5 months , and h...",Not Sucidal
3,I am so exhausted of this. Just when I think I...,The text contains references to self-harm
4,I have been severly bullied since i was 5 till...,Not Sucidal


In [None]:
import spacy

In [None]:
# Small English pipeline used by preprocess_text. Only lemmas and the
# stop-word / punctuation flags are read from its output, so the dependency
# parser and named-entity recognizer are disabled to avoid running them on
# every document.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw document. ``None`` and float NaN (e.g. an empty CSV cell loaded
        by pandas) are treated as an empty document; any other non-string
        value is converted with ``str()`` before processing.

    Returns
    -------
    str
        Lemmas of the remaining tokens joined by single spaces; ``""`` for
        missing or empty input.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    if not text:
        # Nothing to tokenize; skip the pipeline call entirely.
        return ""
    doc = nlp(text)
    return " ".join(
        token.lemma_ for token in doc if not (token.is_stop or token.is_punct)
    )

In [None]:
# Features are the raw text column; targets are the label column.
X, y = df["text"], df["label"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Lemmatize and strip stop words / punctuation from every document in both splits.
# NOTE(review): X_train / X_test are rebound to their processed versions, so
# re-running this cell processes already-processed text, and running it after
# the vectorizer cell would operate on the vectorized features instead of text.
X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words features. The vocabulary is learned from the training texts
# only and then reused to encode the test texts.
# NOTE(review): X_train / X_test are rebound from text to vectorized features
# here; re-running this cell alone would feed those features back into the
# vectorizer. Restart and run from the split cell to reproduce.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
from sklearn.svm import SVC

In [None]:
# NOTE(review): `svc` is not referenced by any later cell visible in this
# notebook (the grid search and the final model each construct their own
# SVC) — candidate for removal; confirm nothing else depends on it.
svc = SVC()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the RBF-kernel SVC: 5 values of C x 5 values of gamma.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

In [None]:
# Exhaustive cross-validated search over param_grid. refit=True retrains the
# best configuration on the full training set once the search finishes.
# n_jobs=-1 spreads the independent fits over all available cores; the chosen
# parameters and scores are unchanged, only wall-clock time drops.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)

In [None]:
# Run the search: 5 folds x 25 candidates = 125 fits. The captured log shows
# individual fits taking roughly 1.3-2.5 minutes each, so expect a long run.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.512 total time= 2.5min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.512 total time= 2.5min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.512 total time= 2.5min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.512 total time= 2.5min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.512 total time= 2.5min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.569 total time= 1.7min
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.560 total time= 1.7min
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.557 total time= 1.7min
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.556 total time= 1.7min
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.550 total time= 1.7min
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.688 total time= 1.3min
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [None]:
# Show the hyperparameter combination selected by the grid search.
print(grid.best_params_)

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}


In [None]:
# Build the final classifier from the search result instead of hand-copying
# values from the printed output, so it cannot drift if the data or the grid
# change. For the run captured in this notebook this resolves to
# C=10, gamma=0.001, kernel='rbf'.
final_model = SVC(**grid.best_params_)

In [None]:
# Train the final classifier on the full vectorized training split.
# NOTE(review): the grid search was created with refit=True, so
# grid.best_estimator_ should already hold a model fitted on this same data —
# confirm whether this second fit is needed.
final_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split; used by the metric cells below.
y_preds = final_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Per-class precision, recall and F1 on the held-out test split.
print(
    classification_report(y_test, y_preds)
)

                                           precision    recall  f1-score   support

                              Not Sucidal       0.75      0.66      0.70      3073
The text contains references to self-harm       0.69      0.77      0.73      3036

                                 accuracy                           0.71      6109
                                macro avg       0.72      0.72      0.71      6109
                             weighted avg       0.72      0.71      0.71      6109



In [None]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels (the row sums match the per-class support in the report).
confusion_matrix(y_test, y_preds)

array([[2022, 1051],
       [ 692, 2344]])

In [None]:
# Overall fraction of test documents classified correctly.
accuracy_score(y_test, y_preds)

0.7146832542150925

In [None]:
import pickle

In [None]:
# Persist the trained classifier.
filename = 'model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(final_model, file)

# The classifier was trained on CountVectorizer features, so the fitted
# vectorizer (its learned vocabulary) is needed to encode new text at
# inference time. Save it next to the model; new text must also go through
# preprocess_text before being vectorized.
vectorizer_filename = 'vectorizer.pkl'
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(vectorizer, file)