In [1]:
import pandas as pd 
import numpy as np
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix

### Data Facts and Import 

In [2]:
df_train = pd.read_csv('Dataset/SentimentAnalysisofTweetsthroughAltmetrics/train.csv')
df_test = pd.read_csv('Dataset/SentimentAnalysisofTweetsthroughAltmetrics/test.csv')

In [3]:
df_train.columns = ["Text", "Label"]
df_test.columns = ["Text", "Label"]

In [4]:
# remove the neutural.

#df_train= df_train[df_train['Is_Response'] != 0]
#df_test= df_test[df_test['Is_Response'] != 0]

In [5]:
df_train.shape

(732, 2)

In [6]:
df_test.shape

(314, 2)

In [7]:
df_train.head()

Unnamed: 0,Text,Label
0,good acronym copper nanotubes Definitely,-1
1,Author Michael Walz Wilhelm,0
2,GlycemicIndex diet restricted energy effective...,1
3,higher fibre intake partic cereal fibre reduce...,1
4,next life going research copper nanotubes CuNTs,-1


In [8]:
df_test.head()

Unnamed: 0,Text,Label
0,Yeah paper ebirdf,1
1,Nutrients Free Full Text,0
2,platform Bioinformatics paper advanced access ...,1
3,Regional distribution styrene analogues genera...,0
4,Duan naturally award Best Science Acronym year,-1


In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    732 non-null    object
 1   Label   732 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.6+ KB


In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    314 non-null    object
 1   Label   314 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.0+ KB


In [11]:
df_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Label,732.0,0.019126,0.801616,-1.0,-1.0,0.0,1.0,1.0


In [12]:
df_test.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Label,314.0,0.009554,0.8013,-1.0,-1.0,0.0,1.0,1.0


### Data Cleaning / EDA

In [13]:
# ### Checking Missing values in the Data Set and printing the Percentage for Missing Values for Each Columns ###

# count = df_train.isnull().sum().sort_values(ascending=False)
# percentage = ((df_train.isnull().sum()/len(df_train)*100)).sort_values(ascending=False)
# missing_data = pd.concat([count, percentage], axis=1, keys=['Count','Percentage'])

# print('Count and percentage of missing values for the columns:')

# missing_data

In [14]:
# ### Checking for the Distribution of Default ###
# import matplotlib.pyplot as plt
# %matplotlib inline
# print('Percentage for default\n')
# print(round(df_train.Is_Response.value_counts(normalize=True)*100,2))
# round(df_train.Is_Response.value_counts(normalize=True)*100,2).plot(kind='bar')
# plt.title('Percentage Distributions by review type')
# plt.show()

In [15]:
#Removing columns
#df_train.drop(columns = ['User_ID', 'Browser_Used', 'Device_Used'], inplace = True)

In [16]:
# #This function converts to lower-case, removes square bracket, removes numbers and punctuation
# def text_clean_1(text):
#     text = text.lower()
#     text = re.sub('\[.*?\]', '', text)
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub('\w*\d\w*', '', text)
#     return text

# cleaned1 = lambda x: text_clean_1(x)

In [17]:
# # Apply first level cleaning

# # Let's take a look at the updated text
# df_train['cleaned_description'] = pd.DataFrame(df_train.Description.apply(cleaned1))
# df_train.head(10)

In [18]:
# # Apply a second round of cleaning
# def text_clean_2(text):
#     text = re.sub('[‘’“”…]', '', text)
#     text = re.sub('\n', '', text)
#     return text

# cleaned2 = lambda x: text_clean_2(x)

In [19]:
# # Let's take a look at the updated text
# df_train['cleaned_description_new'] = pd.DataFrame(df_train['cleaned_description'].apply(cleaned2))
# df_train.head(10)

### spliting the data.

In [20]:
from sklearn.model_selection import train_test_split

# Independent_var = df_train.cleaned_description_new
# Dependent_var = df_train.Is_Response

# IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.1, random_state = 225)

IV_train = df_train.Text
DV_train = df_train.Label
IV_test = df_test.Text
DV_test = df_test.Label


print('IV_train :', len(IV_train))
print('IV_test  :', len(IV_test))
print('DV_train :', len(DV_train))
print('DV_test  :', len(DV_test))


IV_train : 732
IV_test  : 314
DV_train : 732
DV_test  : 314


### Models 

### Logistic regression.

In [21]:
#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=LogisticRegression()
model.fit(tfidf_train, DV_train) 

In [22]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]



In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

Accuracy :  0.7579617834394905
Precision :  0.7593904743379041
Recall :  0.7579617834394905
F1 Score :  0.7579617834394905


In [24]:
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields
from IPython.core.display import display, HTML
show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

  from IPython.core.display import display, HTML


In [25]:
eli5.show_weights(model, vec=tfidf_vectorizer, target_names=df_test['Label'], horizontal_layout=False)



Weight?,Feature
+2.746,acronym
+1.919,copper
+1.776,nanotubes
+1.617,cunts
+1.566,acronyms
+1.501,scientific
+1.329,unfortunate
+1.254,abbreviations
+1.164,abbreviation
+0.978,chinese

Weight?,Feature
+1.357,marinedebris
+1.257,marinelitter
+0.913,satiety
+0.657,effect
+0.598,disease
+0.597,marine
… 959 more positive …,… 959 more positive …
… 1139 more negative …,… 1139 more negative …
-0.597,finding
-0.603,cunt

Weight?,Feature
+1.335,study
+1.128,diet
+1.123,read
+1.106,based
+1.018,health
+0.928,interesting
+0.922,protein
+0.917,lower
+0.850,weight
+0.836,higher


In [26]:
show_html_expl(explain_prediction(model, df_test.Text[10], tfidf_vectorizer, target_names=df_test['Label']), force_weights=False, horizontal_layout=True)

Contribution?,Feature
-0.305,<BIAS>
-0.476,Highlighted in text (sum)

Contribution?,Feature
0.343,<BIAS>
-0.242,Highlighted in text (sum)

Contribution?,Feature
0.717,Highlighted in text (sum)
-0.038,<BIAS>


### Support Vector Classfication (SVM)

In [None]:
from sklearn.svm import SVC


#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=SVC(probability=True)
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense()) # [:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

### MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=MultinomialNB()
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier 

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=KNeighborsClassifier()
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

In [None]:
from sklearn.pipeline import make_pipeline

# Explaining the predictions and important features for predicting the label 1
c = make_pipeline(tfidf_vectorizer, model)

### Deceion Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=DecisionTreeClassifier(random_state = 1)
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

### Random Forest 

In [None]:
# https://www.kdnuggets.com/2022/01/explain-nlp-models-lime.html

from sklearn.ensemble import RandomForestClassifier

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=RandomForestClassifier(n_estimators = 100, random_state = 10)
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))