In [48]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import f1_score
import plotly.express as px
import plotly.graph_objs as go

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [49]:
# FOR REPRODUCIBILTY
np.random.seed(0)
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [50]:
data = pd.read_csv("/kaggle/input/text-dataset/FinalDataset.csv")
data.head()

Unnamed: 0,sentence,class
0,What city in the United States has the highest...,0
1,"At work, wishing I was out on the boat",0
2,A smile is a curve that sets everything straig...,0
3,Does sleep quality mediate the association bet...,0
4,What city was found on the west bank of the ri...,0


In [51]:
data['class'].value_counts()

class
0    400015
1    376930
Name: count, dtype: int64

In [52]:
data['sentence'] = data['sentence'].str.lower()

In [53]:
# Creating X data and y labels
X = data['sentence']
y = data['class']

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state=42)

In [55]:
# Applying TFIDF feature extraction on X
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)

X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

In [56]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(629325, 287164) (629325,)
(69925, 287164) (69925,)
(77695, 287164) (77695,)


In [57]:
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)

In [58]:
y_predict = model.predict(X_train)
print("Train AUC-ROC: ", roc_auc_score(y_train, y_predict))

Train AUC-ROC:  0.9183614862140068


In [59]:
y_predict = model.predict(X_val)
print("Val AUC-ROC: ", roc_auc_score(y_val, y_predict))

Val AUC-ROC:  0.8473488872607955


In [60]:
y_predict = model.predict(X_test)
print("Test AUC-ROC: ", roc_auc_score(y_test, y_predict))

Test AUC-ROC:  0.8481249953793123


In [61]:
train, test = train_test_split(data, test_size = 0.1, random_state=42)

data = {}
count = 0
errors = []
aucroc = []
f1 = []
key_list = []

def getRangeRows(df, x, y):
    df['word_lengths'] = df['sentence'].apply(lambda x: len(x.split(" ")))
    df = df[(df['word_lengths'] >= x) & (df['word_lengths'] <= y)]
    return df
start = 10
while start < 200:
    if start >= 140:
        newDf = getRangeRows(test, start, start+19)
    elif start >= 40:
        newDf = getRangeRows(test, start, start+9)
    else:
        newDf = getRangeRows(test, start, start+4)
    
    count = 0
    total_acc_test = 0
    total_f1_score = 0
    
    X_test = vectorizer.transform(newDf['sentence'].values)
    y_predict = model.predict(X_test)
    total_acc_test = roc_auc_score(newDf['class'].values, y_predict)
    total_f1_score = f1_score(newDf['class'].values, y_predict)
    
    if start >= 140:
        key = str(start) + '-' + str(start + 19)
        start += 20
    elif start >= 40:
        key = str(start) + '-' + str(start + 9)
        start += 10
    else:
        key = str(start) + '-' + str(start + 4)
        start += 5

    aucroc.append(total_acc_test)
    f1.append(total_f1_score)
    key_list.append(key)
    print("For range of text length:", key)    
    print("Test AUCROC:", total_acc_test)
    print("F1 score:", total_f1_score)
    print("======================================")
    print()

For range of text length: 10-14
Test AUCROC: 0.7991188794188442
F1 score: 0.7742320633886899

For range of text length: 15-19
Test AUCROC: 0.8265355785698866
F1 score: 0.8241980436089404

For range of text length: 20-24
Test AUCROC: 0.8545378485250628
F1 score: 0.854733049459548

For range of text length: 25-29
Test AUCROC: 0.8849566576862381
F1 score: 0.875496104659709

For range of text length: 30-34
Test AUCROC: 0.878530353221424
F1 score: 0.901613413476748

For range of text length: 35-39
Test AUCROC: 0.8390019168039816
F1 score: 0.8690928843020098

For range of text length: 40-49
Test AUCROC: 0.8561695646133183
F1 score: 0.8664353859496965

For range of text length: 50-59
Test AUCROC: 0.8399407985973426
F1 score: 0.8753762793497893

For range of text length: 60-69
Test AUCROC: 0.8694634529434864
F1 score: 0.9217391304347826

For range of text length: 70-79
Test AUCROC: 0.8285399232651581
F1 score: 0.9411764705882352

For range of text length: 80-89
Test AUCROC: 0.8071116408422727


In [62]:
fig = go.Figure()

fig.add_trace(go.Bar(x=key_list, y=aucroc, name='AUC-ROC'))
fig.add_trace(go.Bar(x=key_list, y=f1, name='F1 score'))

fig.update_xaxes(title_text='Word length range')
fig.update_yaxes(title_text='AUC-ROC/F1 score')

fig.update_layout(title='Performance of model over texts of different lengths', title_x=0.5)
fig.update_yaxes(range=[0, 1.2])
fig.update_layout(width=1000, height=500)

fig.show()