In [42]:
import spacy
from spacy import displacy

In [43]:
nlp = spacy.load('en_core_web_sm')

In [44]:
from spacy.lang.en.stop_words import STOP_WORDS

In [45]:
stopwords = list(STOP_WORDS)
print(len(stopwords))

326


### Entity detection

In [46]:
par = "National Assembly Majority Leader Amos Kimunya has said the main agenda for the upcoming Sagana 3 meeting is uniting the Mount Kenya Region and eliminating fake narratives about Presiddent Uhuru Kenyatta. The President is scheduled to meet Mount Kenya leaders at Sagana State Lodge, Nyeri."

In [47]:
doc = nlp(par)

In [48]:
doc

National Assembly Majority Leader Amos Kimunya has said the main agenda for the upcoming Sagana 3 meeting is uniting the Mount Kenya Region and eliminating fake narratives about Presiddent Uhuru Kenyatta. The President is scheduled to meet Mount Kenya leaders at Sagana State Lodge, Nyeri.

In [49]:
displacy.render(doc, style='ent')

 ### Text classification 

In [50]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [51]:
df_yelp = pd.read_csv('yelp_labelled.txt', sep='\t', header=None)
df_amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)
df_imdb = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)

data = df_yelp.append([df_amazon, df_imdb], ignore_index=True)
columns_names = ['Review', 'Sentiment']
data.columns = columns_names

In [52]:
data.shape

(2748, 2)

In [53]:
data.Sentiment.value_counts(normalize=True)

1    0.504367
0    0.495633
Name: Sentiment, dtype: float64

In [54]:
data.isna().sum()

Review       0
Sentiment    0
dtype: int64

### Tokenization

In [55]:
import string

In [56]:
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### data cleaning

In [57]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ !="-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
             cleaned_tokens.append(token)
    return cleaned_tokens

In [58]:
text_data_cleaning("   Hello, I hope you are doing okay")

['hello', 'hope', 'okay']

### Vectorization feature engineering (TF-IDF)

In [59]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [60]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = RandomForestClassifier()

In [61]:
X = data["Review"]
y = data["Sentiment"]

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [63]:
X_train.shape, X_test.shape

((2198,), (550,))

In [64]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [65]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x000002696874CB80>)),
                ('clf', RandomForestClassifier())])

In [66]:
y_pred = clf.predict(X_test)

In [67]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.77      0.78       279
           1       0.77      0.78      0.78       271

    accuracy                           0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [68]:
 confusion_matrix(y_test, y_pred)

array([[215,  64],
       [ 59, 212]], dtype=int64)

In [69]:
clf.predict(['You are so stupid'])

array([0], dtype=int64)

In [70]:
clf.predict(['I love you so much'])

array([1], dtype=int64)

In [71]:
clf.score(X_train, y_train)

0.9913557779799818

In [72]:
import pickle

In [73]:
pickle.dump(clf, open('clf.pkl', 'wb'))