In [1]:
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_json("Data_cyb.json", lines = True, orient = "columns")

In [3]:
rating = []

for i in df["annotation"]:
    rating.append(i["label"][0])
    
df["rating"] = rating

In [4]:
new_df = df[["content", "rating"]]
new_df.tail()

Unnamed: 0,content,rating
19996,I dont. But what is complaining about it goi...,0
19997,Bahah yeah i&;m totally just gonna&; get pis...,0
19998,hahahahaha >:) im evil mwahahahahahahahahaha,0
19999,What&;s something unique about Ohio? :),0
20000,Who is the biggest gossiper you know?,0


In [5]:
new_df.to_csv("comments.csv")

In [6]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20001 entries, 0 to 20000
Data columns (total 2 columns):
content    20001 non-null object
rating     20001 non-null object
dtypes: object(2)
memory usage: 156.3+ KB


In [7]:
X, X_test, y, y_test = train_test_split(new_df["content"], new_df["rating"], train_size = 0.8)


In [8]:
import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(X)
reviews_test_clean = preprocess_reviews(X_test)

In [9]:
baseline_vectorizer = CountVectorizer(binary=True)
baseline_vectorizer.fit(reviews_train_clean)
X_baseline = baseline_vectorizer.transform(reviews_train_clean)
X_test_baseline = baseline_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X_baseline, y, train_size = 0.80
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    


final_model = LogisticRegression(C=0.05)
final_model.fit(X_baseline, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_model.predict(X_test_baseline)))




Accuracy for C=0.01: 0.678125
Accuracy for C=0.05: 0.7253125
Accuracy for C=0.25: 0.773125
Accuracy for C=0.5: 0.800625
Accuracy for C=1: 0.819375
Final Accuracy: 0.7348162959260185


# Remove Stop Words
Removing Stop Words

Stop words are the very common words like ‘if’, ‘but’, ‘we’, ‘he’, ‘she’, and ‘they’. We can usually remove these 

words without changing the semantics of a text 

In [10]:
from nltk.corpus import stopwords

In [11]:
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

In [12]:
no_stop_words_train = remove_stop_words(reviews_train_clean)
no_stop_words_test = remove_stop_words(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(no_stop_words_train)
X = cv.transform(no_stop_words_train)
X_test = cv.transform(no_stop_words_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.67225
Accuracy for C=0.05: 0.7135
Accuracy for C=0.25: 0.76625
Accuracy for C=0.5: 0.78575
Accuracy for C=1: 0.80075


# Normalization
A common next step in text preprocessing is to normalize the words in your corpus by trying to convert all of the different forms of a given word into one. Two methods that exist for this are Stemming and Lemmatization.


# Stemming

In [13]:
def get_stemmed_text(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews_train = get_stemmed_text(reviews_train_clean)
stemmed_reviews_test = get_stemmed_text(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(stemmed_reviews_train)
X = cv.transform(stemmed_reviews_train)
X_test = cv.transform(stemmed_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_stemmed = LogisticRegression(C=0.05)
final_stemmed.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_stemmed.predict(X_test)))



Accuracy for C=0.01: 0.67625
Accuracy for C=0.05: 0.71175
Accuracy for C=0.25: 0.76225
Accuracy for C=0.5: 0.784
Accuracy for C=1: 0.80325
Final Accuracy: 0.7260684828792802


# Lemmatization

In [14]:
def get_lemmatized_text(corpus):
    
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)
lemmatized_reviews_test = get_lemmatized_text(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(lemmatized_reviews_train)
X = cv.transform(lemmatized_reviews_train)
X_test = cv.transform(lemmatized_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_lemmatized = LogisticRegression(C=0.25)
final_lemmatized.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_lemmatized.predict(X_test)))



Accuracy for C=0.01: 0.67025
Accuracy for C=0.05: 0.71725
Accuracy for C=0.25: 0.76525
Accuracy for C=0.5: 0.78875
Accuracy for C=1: 0.811
Final Accuracy: 0.7883029242689328


# n-grams

Last time we used only single word features in our model, which we call 1-grams or unigrams. We can potentially add more predictive power to our model by adding two or three word sequences (bigrams or trigrams) as well. For example, if a review had the three word sequence “didn’t love movie” we would only consider these words individually with a unigram-only model and probably not capture that this is actually a negative sentiment because the word ‘love’ by itself is going to be highly correlated with a positive review.
The scikit-learn library makes this really easy to play around with. Just use the ngram_range argu

In [15]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_ngram = LogisticRegression(C=0.5)
final_ngram.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_ngram.predict(X_test)))



Accuracy for C=0.01: 0.71525
Accuracy for C=0.05: 0.83875
Accuracy for C=0.25: 0.881
Accuracy for C=0.5: 0.8845
Accuracy for C=1: 0.886
Final Accuracy: 0.9215196200949762


# Word Counts

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75, 
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_wc = LogisticRegression(C=0.05)
final_wc.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_wc.predict(X_test)))



Accuracy for C=0.01: 0.678
Accuracy for C=0.05: 0.7225
Accuracy for C=0.25: 0.77025
Accuracy for C=0.5: 0.79775
Accuracy for C=1: 0.819
Final Accuracy: 0.7353161709572607


# TF-IDF

Another common way to represent each document in a corpus is to use the tf-idf statistic (term frequency-inverse

document frequency) for each word, which is a weighting factor that we can use in place of binary or word count 

representations.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_tfidf = LogisticRegression(C=1)
final_tfidf.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_tfidf.predict(X_test)))




Accuracy for C=0.01: 0.6135
Accuracy for C=0.05: 0.6465
Accuracy for C=0.25: 0.7125
Accuracy for C=0.5: 0.7395
Accuracy for C=1: 0.77
Final Accuracy: 0.7615596100974756


# Support Vector Machines (SVM)

Recall that linear classifiers tend to work well on very sparse datasets (like the one we have). Another algorithm that can produce great results with a quick training time are Support Vector Machines with a linear kernel.
Here’s an example with an n-gram range from 1 to 2:

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_svm_ngram.predict(X_test)))

Accuracy for C=0.01: 0.8095
Accuracy for C=0.05: 0.86125
Accuracy for C=0.25: 0.8645
Accuracy for C=0.5: 0.85875
Accuracy for C=1: 0.8565
Final Accuracy: 0.8492876780804799


# Final Model

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords

In [20]:
stop_words = stopwords.words('english')
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
final = LinearSVC(C=0.01)
final.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final.predict(X_test)))


Accuracy for C=0.001: 0.671
Accuracy for C=0.005: 0.78425
Accuracy for C=0.01: 0.82725
Accuracy for C=0.05: 0.86825
Accuracy for C=0.1: 0.8715
Final Accuracy: 0.8620344913771557


# Top Postitive and Negative Features

In [21]:
feature_to_coef = {
    word: coef for word, coef in zip(
        ngram_vectorizer.get_feature_names(), final.coef_[0]
    )
}

for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:30]:
    print (best_positive)
    
print("\n\n")
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:30]:
    print (best_negative)

('fuck', 0.8787641866758699)
('gay', 0.8419057410964496)
('bitch', 0.77037520834638)
('emo', 0.7551068186950715)
('ass', 0.6545746005201926)
('whore', 0.6313668130537389)
('fucking', 0.6266162312325702)
('loser', 0.5705732246663429)
('sucks', 0.5178521751092767)
('hate', 0.5031726916314133)
('nerd', 0.47548308157779007)
('damn', 0.45929804285039794)
('fat', 0.45203969845793257)
('dick', 0.4191568786133525)
('cunt', 0.4167525553702202)
('cock', 0.4149538312256424)
('pig', 0.3842750449770479)
('freak', 0.3602799010092929)
('ugly', 0.3556761473468795)
('slut', 0.35002384729719255)
('pussy', 0.33939139431988624)
('whale', 0.30748740224716953)
('fag', 0.28095818906311454)
('piss', 0.2382928021005454)
('wow', 0.2380059148415493)
('hoe', 0.22990806194681426)
('hate much', 0.22708841368656502)
('cum', 0.21643701329252374)
('retard', 0.21427174258282358)
('ur', 0.20050718382434954)



('ever', -0.29791585832532197)
('thanks', -0.25876360700986145)
('love', -0.225424829254146)
('nope', -0.217714

# Let's test this baby out!

In [27]:
tweets = pd.read_csv("twitter_comments.csv")
tweets.head()

Unnamed: 0,tweet,rating
0,Why is so much money sent to the Elijah Cummin...,1
1,....As proven last week during a Congressional...,1
2,"Rep, Elijah Cummings has been a brutal bully, ...",1
3,Wow! Big VICTORY on the Wall. The United State...,0
4,"Today, President Trump announced a safe third ...",0


In [40]:
X, X_test, y, y_test = train_test_split(tweets["tweet"], tweets["rating"], train_size = 0.8)


In [41]:
import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(X)
reviews_test_clean = preprocess_reviews(X_test)

In [42]:
reviews_train_clean[5]

'@repcummings is a champion in the congress and the country for civil rights and economic justice a beloved leader in baltimore and deeply valued colleague we all reject racist attacks against him and support his steadfast leadership #elijahcummingsisapatriothttps  twittercom realdonaldtrump status \xa0…'

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))


Accuracy for C=0.01: 0.42857142857142855
Accuracy for C=0.05: 0.42857142857142855
Accuracy for C=0.25: 0.42857142857142855
Accuracy for C=0.5: 0.42857142857142855
Accuracy for C=1: 0.42857142857142855




In [45]:
stop_words = stopwords.words('english')
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
final = LinearSVC(C=0.01)
final.fit(X, y)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final.predict(X_test)))

Accuracy for C=0.001: 0.5714285714285714
Accuracy for C=0.005: 0.5714285714285714
Accuracy for C=0.01: 0.5714285714285714
Accuracy for C=0.05: 0.5714285714285714
Accuracy for C=0.1: 0.7142857142857143
Final Accuracy: 0.7142857142857143


In [48]:
from sklearn.metrics import classification_report
predictions = final.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       0.00      0.00      0.00         2

    accuracy                           0.71         7
   macro avg       0.36      0.50      0.42         7
weighted avg       0.51      0.71      0.60         7



  'precision', 'predicted', average, warn_for)
