In [2]:
import pandas as pd 
import numpy as np
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix

### Data Facts and Import 

In [3]:
df_train = pd.read_csv('Dataset/SentimentAnalysisofTweetsthroughAltmetrics/train.csv')
df_test = pd.read_csv('Dataset/SentimentAnalysisofTweetsthroughAltmetrics/test.csv')

In [4]:
df_train.columns = ["Text", "Label"]
df_test.columns = ["Text", "Label"]

In [5]:
# remove the neutural.
df_train= df_train[df_train['Label'] != 0]
df_test= df_test[df_test['Label'] != 0]

In [6]:
df_train.shape

(470, 2)

In [7]:
df_train.head()

Unnamed: 0,Text,Label
0,good acronym copper nanotubes Definitely,-1
2,GlycemicIndex diet restricted energy effective...,1
3,higher fibre intake partic cereal fibre reduce...,1
4,next life going research copper nanotubes CuNTs,-1
6,Bean rich diet produces equivalent weight loss...,1


In [8]:
df_test.head()

Unnamed: 0,Text,Label
0,Yeah paper ebirdf,1
2,platform Bioinformatics paper advanced access ...,1
4,Duan naturally award Best Science Acronym year,-1
5,Everything Chinese turns swear word think karma,-1
6,dear difficulties finding scientific abbreviat...,-1


In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 470 entries, 0 to 730
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    470 non-null    object
 1   Label   470 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.0+ KB


In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 313
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    201 non-null    object
 1   Label   201 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.7+ KB


In [11]:
df_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Label,470.0,0.029787,1.000621,-1.0,-1.0,1.0,1.0,1.0


In [12]:
df_test.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Label,201.0,0.014925,1.002385,-1.0,-1.0,1.0,1.0,1.0


### Data Cleaning / EDA

In [12]:
# ### Checking Missing values in the Data Set and printing the Percentage for Missing Values for Each Columns ###

# count = df_train.isnull().sum().sort_values(ascending=False)
# percentage = ((df_train.isnull().sum()/len(df_train)*100)).sort_values(ascending=False)
# missing_data = pd.concat([count, percentage], axis=1,
# keys=['Count','Percentage'])

# print('Count and percentage of missing values for the columns:')

# missing_data

In [13]:
# ### Checking for the Distribution of Default ###
# import matplotlib.pyplot as plt
# %matplotlib inline
# print('Percentage for default\n')
# print(round(df_train.Is_Response.value_counts(normalize=True)*100,2))
# round(df_train.Is_Response.value_counts(normalize=True)*100,2).plot(kind='bar')
# plt.title('Percentage Distributions by review type')
# plt.show()

In [15]:
#Removing columns
#df_train.drop(columns = ['User_ID', 'Browser_Used', 'Device_Used'], inplace = True)

In [14]:
# #This function converts to lower-case, removes square bracket, removes numbers and punctuation
# def text_clean_1(text):
#     text = text.lower()
#     text = re.sub('\[.*?\]', '', text)
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub('\w*\d\w*', '', text)
#     return text

# cleaned1 = lambda x: text_clean_1(x)

In [15]:
# # Apply first level cleaning

# # Let's take a look at the updated text
# df_train['cleaned_description'] = pd.DataFrame(df_train.Description.apply(cleaned1))
# df_train.head(10)

In [16]:
# # Apply a second round of cleaning
# def text_clean_2(text):
#     text = re.sub('[‘’“”…]', '', text)
#     text = re.sub('\n', '', text)
#     return text

# cleaned2 = lambda x: text_clean_2(x)

In [17]:
# # Let's take a look at the updated text
# df_train['cleaned_description_new'] = pd.DataFrame(df_train['cleaned_description'].apply(cleaned2))
# df_train.head(10)

### Spliting the data.

In [13]:
from sklearn.model_selection import train_test_split

# Independent_var = df_train.cleaned_description_new
# Dependent_var = df_train.Is_Response

# IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.1, random_state = 225)

IV_train = df_train.Text
DV_train = df_train.Label
IV_test = df_test.Text
DV_test = df_test.Label

print('IV_train :', len(IV_train))
print('IV_test  :', len(IV_test))
print('DV_train :', len(DV_train))
print('DV_test  :', len(DV_test))

IV_train : 470
IV_test  : 201
DV_train : 470
DV_test  : 201


### Model training 

### Logistic Regression

In [14]:
#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

In [15]:
model=LogisticRegression()
model.fit(tfidf_train, DV_train)

LogisticRegression()

In [16]:
model.intercept_

array([0.2971307])

In [17]:
model.coef_.shape

(1, 1368)

In [18]:
y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())



In [19]:
y_pred

array([ 1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1,
       -1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
       -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1, -1,
       -1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1,  1,
        1,  1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,  1,
        1, -1, -1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1,
        1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,
        1,  1,  1,  1, -1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1,
        1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
        1, -1,  1,  1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,
       -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1,
       -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1,  1,  1,  1],
      dtype=int64)

In [19]:
y_prob
#-1,1

array([[0.45353657, 0.54646343],
       [0.31561633, 0.68438367],
       [0.73676329, 0.26323671],
       [0.45003932, 0.54996068],
       [0.88333974, 0.11666026],
       [0.14373418, 0.85626582],
       [0.320365  , 0.679635  ],
       [0.2613947 , 0.7386053 ],
       [0.86096358, 0.13903642],
       [0.87109201, 0.12890799],
       [0.17951721, 0.82048279],
       [0.29340654, 0.70659346],
       [0.42625906, 0.57374094],
       [0.87187832, 0.12812168],
       [0.27903955, 0.72096045],
       [0.5886459 , 0.4113541 ],
       [0.53565041, 0.46434959],
       [0.82022174, 0.17977826],
       [0.7579807 , 0.2420193 ],
       [0.60208667, 0.39791333],
       [0.78494706, 0.21505294],
       [0.24223796, 0.75776204],
       [0.40143877, 0.59856123],
       [0.3501597 , 0.6498403 ],
       [0.13795967, 0.86204033],
       [0.72045319, 0.27954681],
       [0.57239175, 0.42760825],
       [0.91905299, 0.08094701],
       [0.28070994, 0.71929006],
       [0.32791226, 0.67208774],
       [0.

In [20]:
confusion_matrix(y_pred, DV_test)

array([[91,  6],
       [ 8, 96]], dtype=int64)

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score  , f1_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

Accuracy :  0.9303482587064676
Precision :  0.9305670104616549
Recall :  0.9303482587064676
F1 Score :  0.9303482587064676


In [22]:
df_test.head(20)

Unnamed: 0,Text,Label
0,Yeah paper ebirdf,1
2,platform Bioinformatics paper advanced access ...,1
4,Duan naturally award Best Science Acronym year,-1
5,Everything Chinese turns swear word think karma,-1
6,dear difficulties finding scientific abbreviat...,-1
8,week study obese patients High fibre carb heal...,1
9,Beyond hours Researchers argue deliberate prac...,1
10,good article interested natural alternatives t...,1
11,Single walled copper nanotubes Without doubt u...,-1
12,poor choice scientific acronym ever,-1


In [24]:
!pip install tabulate
!pip install jinja2==3.0.3
!pip install scikit-learn scipy matplotlib

!pip install sklearn


Collecting jinja2==3.0.3
  Using cached Jinja2-3.0.3-py3-none-any.whl (133 kB)
Installing collected packages: jinja2
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 3.0.0
    Uninstalling Jinja2-3.0.0:
      Successfully uninstalled Jinja2-3.0.0
Successfully installed jinja2-3.0.3


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.10.2 requires ruamel-yaml, which is not installed.
jupyter-server 1.13.5 requires pywinpty<2; os_name == "nt", but you have pywinpty 2.0.2 which is incompatible.




In [25]:
!pip3 install --upgrade jinja2==3.0

Collecting jinja2==3.0
  Using cached Jinja2-3.0.0-py3-none-any.whl (133 kB)
Installing collected packages: jinja2
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 3.0.3
    Uninstalling Jinja2-3.0.3:
      Successfully uninstalled Jinja2-3.0.3
Successfully installed jinja2-3.0.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.10.2 requires ruamel-yaml, which is not installed.
jupyter-server 1.13.5 requires pywinpty<2; os_name == "nt", but you have pywinpty 2.0.2 which is incompatible.


In [29]:
# please make sure, using the python 3.9

In [23]:
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields
from IPython.core.display import display, HTML
show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

  from IPython.core.display import display, HTML


In [24]:
eli5.show_weights(model, vec=tfidf_vectorizer, target_names=df_test['Label'], horizontal_layout=False)



Weight?,Feature
+1.487,study
+1.306,diet
+1.015,article
+0.982,based
+0.930,protein
+0.886,read
+0.885,nutrition
+0.884,health
… 845 more positive …,… 845 more positive …
… 504 more negative …,… 504 more negative …


In [25]:
# Case 1 Correct: Actual Label = 1, Predicted Label = 1,  
show_html_expl( explain_prediction(model, df_test.Text[10], tfidf_vectorizer, target_names=df_test['Label']), force_weights=False, horizontal_layout=True)

Contribution?,Feature
0.742,Highlighted in text (sum)
0.297,<BIAS>


In [26]:
# Case 2 Correct: Actual Label = -1, Predicted Label = -1
show_html_expl( explain_prediction(model, df_test.Text[4], tfidf_vectorizer, target_names=df_test['Label']), force_weights=False, horizontal_layout=True)



Contribution?,Feature
1.326,Highlighted in text (sum)
-0.297,<BIAS>


In [27]:
# Case 3 Wrong: Actual Label = -1, Predicted Label =-1, 

show_html_expl( explain_prediction(model, df_test.Text[5], tfidf_vectorizer, target_names=df_test['Label']), force_weights=False, horizontal_layout=True)



Contribution?,Feature
0.297,<BIAS>
-0.097,Highlighted in text (sum)


### Support Vector Machine

In [62]:
from sklearn.svm import SVC


#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=SVC(probability=True)
model.fit(tfidf_train, DV_train) 

SVC(probability=True)

In [63]:

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense()) # [:,1]



In [64]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

Accuracy :  0.9104477611940298
Precision :  0.9106724251588167
Recall :  0.9104477611940298
F1 Score :  0.9104477611940298


In [65]:
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields
from IPython.core.display import display, HTML
show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

  from IPython.core.display import display, HTML


In [33]:
eli5.show_weights(model, vec=tfidf_vectorizer, target_names=df_test['Label'], horizontal_layout=False)



Weight?,Feature
+1.487,study
+1.306,diet
+1.015,article
+0.982,based
+0.930,protein
+0.886,read
+0.885,nutrition
+0.884,health
… 845 more positive …,… 845 more positive …
… 504 more negative …,… 504 more negative …


In [37]:
show_html_expl(explain_prediction(model, df_test.Text[10], tfidf_vectorizer, target_names=df_test['Label']), force_weights=False, horizontal_layout=True)



Contribution?,Feature
0.742,Highlighted in text (sum)
0.297,<BIAS>


### GaussianNB

In [None]:
# # Create the tf-idf vectorizer
# vectorizer = TfidfVectorizer(strip_accents='ascii')

# # First fit the vectorizer with our training set
# tfidf_train = vectorizer.fit_transform(IV_train)

# # Now we can fit our test data with the same vectorizer
# tfidf_test = vectorizer.transform(IV_test)

# # https://www.kaggle.com/code/barishasdemir/classification-with-naive-bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(IV_train).toarray()
tfidf_test = tfidf_vectorizer.transform(IV_test).toarray()

model=GaussianNB()
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test)
y_prob = model.predict_proba(tfidf_test)[:,1]

### MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=MultinomialNB()
model.fit(tfidf_train, DV_train) 


In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier 

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=KNeighborsClassifier()
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

### Deceion Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=DecisionTreeClassifier(random_state = 1)
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

### Random Forest 

In [None]:
# https://www.kdnuggets.com/2022/01/explain-nlp-models-lime.html

from sklearn.ensemble import RandomForestClassifier

#TF-IDF
# Convert x_train to vector
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(IV_train) 
tfidf_test = tfidf_vectorizer.transform(IV_test)

model=RandomForestClassifier(n_estimators = 100, random_state = 10)
model.fit(tfidf_train, DV_train) 

In [None]:
#prediction

y_pred = model.predict(tfidf_test.todense())
y_prob = model.predict_proba(tfidf_test.todense())[:,1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_pred, DV_test))
print("Precision : ", precision_score(y_pred, DV_test, average = 'weighted'))
print("Recall : ", recall_score(y_pred, DV_test, average = 'weighted'))
print("F1 Score : ", recall_score(y_pred, DV_test, average = 'weighted'))

In [None]:
#import lime
#import lime.lime_text
from lime_text import LimeTextExplainer

# from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=[0,1])

In [None]:
idx =64
df_train.iloc[idx]

In [None]:
from sklearn.pipeline import make_pipeline

# Explaining the predictions and important features for predicting the label 1
c = make_pipeline(tfidf_vectorizer, model)

In [None]:
c

In [None]:
IV_test.iloc[15]

In [None]:
idx =0

explainer = LimeTextExplainer(class_names=[1,-1])
# classifier_fn is the probability function that takes a string and returns prediction probabilities.
# num_features is the max. number of features we want in the explanation(default is 10).
# labels=(1,) means we want the explanation for the label 1
exp = explainer.explain_instance(IV_test.iloc[15], c.predict_proba, num_features=10)
exp.show_in_notebook()
