In [2]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from normalizer import Normalizer
from results import ModelEvaluation

    Load the data

In [3]:
data = pd.read_csv("../dataset/TwitterDataset.txt", sep="\t")

    Let's describe the data

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39776 entries, 0 to 39775
Data columns (total 3 columns):
 #   Column                                                                                                                          Non-Null Count  Dtype 
---  ------                                                                                                                          --------------  ----- 
 0   TrainSen                                                                                                                        39776 non-null  object
 1   0                                                                                                                               39776 non-null  int64 
 2   @0430yes i hope youre lurking rn. i want to listen to hallucination & wanna love you again live someday, pretty please?! 😭 😭 😭  39776 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.4+ KB


    There are no column names let's provide column names.

In [5]:
data.columns = ["sample_type", "label", "text"] #changes column names of the dataset.
data = data.drop(columns=["sample_type"]) #drop the redudant column.
data = data.sample(frac=1, random_state=42).reset_index(drop=True) #shuffle the dataset.

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39776 entries, 0 to 39775
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   39776 non-null  int64 
 1   text    39776 non-null  object
dtypes: int64(1), object(1)
memory usage: 621.6+ KB


In [7]:
data.head(10)

Unnamed: 0,label,text
0,1,About to make up an entire essay good think I'...
1,0,@bonnielazzara @gamegirl404 @claudia_cpucci @H...
2,1,American Literature has been an interesting co...
3,1,@rachelkcollier sound like backing vocalists #...
4,1,I said I was gonna wash my face before I went ...
5,1,I used to think i was good at multi-tasking. T...
6,1,It's okay to burn a flag with your 1st Amendme...
7,0,🌸💐🌸💐🌸 HAPPY FRIDAY Blessings ~Love ~Light Wish...
8,1,last picture with my beautifull sister 3 R. I....
9,1,Side joy of business travel : spur of the mome...


    Get sparse matrix representation of the text:

In [8]:
normarlizer = Normalizer()
matrix = normarlizer.vectorize(pd.DataFrame({"text": data["text"]}))

[INFO] Trying to create a sparse matrix for text, using an instance of TfIdf_vectorizer
[INFO] Extracting columns containing text from dataframe.
[INFO] Successfully extracted text columns from the dataset.
[INFO] Applying Normalization over text:
[INFO]       - Converting Text into lower case for caseconsistency.
[INFO]       - Extracting only words containing alphabets.
[INFO] Text Normalization is now complete.
[INFO] Fitting the vecotirzer to given text.
[INFO] Transforming the text into a sparse matrix.
[INFO] Sparse Matrix has been successfully created over the text given as input.


    Let's look at the matrix:

In [9]:
matrix

<39776x34292 sparse matrix of type '<class 'numpy.float64'>'
	with 550273 stored elements in Compressed Sparse Row format>

    Let's hold out some data for testing.

In [10]:
y = data["label"]
X = matrix.toarray()
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    Let's now fit a Support Vector Classifier model to this:

In [11]:
SV_classifier = SVC()

In [12]:
SV_classifier = SV_classifier.fit(X_train, y_train)

    Let's test the SCV model:

In [None]:
y_pred = SV_classifier.predict(X_test)
results = ModelEvaluation()

In [None]:
results.classification_report(y_test, y_pred)

    Let's look at the Confusion Matrix to see how well it can differentiate between texts.

In [None]:
results.get_cm(y_true=y_test, y_pred=y_pred)

In [None]:
y_proba = SV_classifier.predict_proba(X_test)[:, 1]
y_proba[:10]
threshold = results.get_roc(y_true=y_test, y_scores=y_proba)

    Let's cross validate the model:

In [None]:
results.cross_validation_score(SV_classifier, X, y, "accuracy")

In [None]:
import joblib

joblib.dump(SV_classifier, "../TwitterDataModels/TrainedModels/SupportVectorClassifier.joblib")