In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import string
import nltk
import re
from pathlib import Path
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets

In [2]:
#download nltk functions
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
#set pd options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

In [4]:
#read in csv file
df_data = pd.read_csv("labeled_data.csv")

In [5]:
#view data frame
df_data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;


In [6]:
#delete unnamed column
df_data = df_data.drop(columns=['Unnamed: 0'])
df_data

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;
...,...,...,...,...,...,...
24778,3,0,2,1,1,"you's a muthaf***in lie &#8220;@LifeAsKing: @20_Pearls @corey_emanuel right! His TL is trash &#8230;. Now, mine? Bible scriptures and hymns&#8221;"
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, and drove me redneck crazy"
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I aint fuckin dis up again
24781,6,0,6,0,1,youu got wild bitches tellin you lies


In [7]:
#view data types
df_data.dtypes

count                  int64
hate_speech            int64
offensive_language     int64
neither                int64
class                  int64
tweet                 object
dtype: object

In [8]:
#Remove @Usernames
tweets_usernames_removed=[]
row = 0
for i in df_data['tweet']:
    text = ' '.join(re.sub("([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)"," ", i).split())
    tweets_usernames_removed.append(text)

In [9]:
#Remove Retweet tag:  RT
tweets_sub=[]
from itertools import count
for i in range(len(tweets_usernames_removed)):
  results=re.sub(r"RT", " ", tweets_usernames_removed[i])
  tweets_sub.append(results)

In [10]:
#Remove punctuation
tweets_cleaned = []
regex = re.compile("[^a-zA-Z ]")
for i in range(len(tweets_sub)):
    re_clean = regex.sub('', tweets_sub[i])
    tweets_cleaned.append(re_clean)
                

In [11]:
#Print tweets cleaned list
print(tweets_cleaned)



In [12]:
#Assign new column labeled "cleaned_tweets" to original dataframe and populate with data from tweets_cleaned list
df_data_cleaned = df_data.assign(cleaned_tweets = tweets_cleaned)
df_data_cleaned

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,cleaned_tweets
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...,As a woman you shouldnt complain about cleaning up your house amp as a man you should always take the trash out
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!,boy dats coldtyga dwn bad for cuffin dat hoe in the st place
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit,Dawg You ever fuck a bitch and she start to cry You be confused as shit
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny,she look like a tranny
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;,The shit you hear about me might be true or it might be faker than the bitch who told it to ya
...,...,...,...,...,...,...,...
24778,3,0,2,1,1,"you's a muthaf***in lie &#8220;@LifeAsKing: @20_Pearls @corey_emanuel right! His TL is trash &#8230;. Now, mine? Bible scriptures and hymns&#8221;",yous a muthafin lie right His TL is trash Now mine Bible scriptures and hymns
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, and drove me redneck crazy",youve gone and broke the wrong heart baby and drove me redneck crazy
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I aint fuckin dis up again,young buck wanna eat dat nigguh like I aint fuckin dis up again
24781,6,0,6,0,1,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies


In [13]:
#drop tweet column from df_data_cleaned
df_data_cleaned = df_data_cleaned.drop(columns=['tweet'])
df_data_cleaned

Unnamed: 0,count,hate_speech,offensive_language,neither,class,cleaned_tweets
0,3,0,0,3,2,As a woman you shouldnt complain about cleaning up your house amp as a man you should always take the trash out
1,3,0,3,0,1,boy dats coldtyga dwn bad for cuffin dat hoe in the st place
2,3,0,3,0,1,Dawg You ever fuck a bitch and she start to cry You be confused as shit
3,3,0,2,1,1,she look like a tranny
4,6,0,6,0,1,The shit you hear about me might be true or it might be faker than the bitch who told it to ya
...,...,...,...,...,...,...
24778,3,0,2,1,1,yous a muthafin lie right His TL is trash Now mine Bible scriptures and hymns
24779,3,0,1,2,2,youve gone and broke the wrong heart baby and drove me redneck crazy
24780,3,0,3,0,1,young buck wanna eat dat nigguh like I aint fuckin dis up again
24781,6,0,6,0,1,youu got wild bitches tellin you lies


In [14]:
#Start Sentiment Section

In [15]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [16]:
# Analyze data in df_data_cleaned
all_lines = []
i = 0
for index, row in df_data_cleaned.iterrows():
    try:
        #sentiment scoring with VADER 
        line_sentiment = analyzer.polarity_scores(row["cleaned_tweets"])
        all_lines.append(line_sentiment)

    except AttributeError:
        pass
    
all_lines[0:2]

[{'neg': 0.0, 'neu': 0.9, 'pos': 0.1, 'compound': 0.2755},
 {'neg': 0.226, 'neu': 0.774, 'pos': 0.0, 'compound': -0.5423}]

In [17]:
#assign sentiment data to new data frame "sentiment_df"
sentiment_df = pd.DataFrame(all_lines)

#Join sentiment_df to df_data_cleaned
sentiment_df = sentiment_df.join(df_data_cleaned)
sentiment_df.head()

Unnamed: 0,neg,neu,pos,compound,count,hate_speech,offensive_language,neither,class,cleaned_tweets
0,0.0,0.9,0.1,0.2755,3,0,0,3,2,As a woman you shouldnt complain about cleaning up your house amp as a man you should always take the trash out
1,0.226,0.774,0.0,-0.5423,3,0,3,0,1,boy dats coldtyga dwn bad for cuffin dat hoe in the st place
2,0.62,0.38,0.0,-0.946,3,0,3,0,1,Dawg You ever fuck a bitch and she start to cry You be confused as shit
3,0.0,0.545,0.455,0.3612,3,0,2,1,1,she look like a tranny
4,0.253,0.651,0.096,-0.6808,6,0,6,0,1,The shit you hear about me might be true or it might be faker than the bitch who told it to ya


In [90]:
#create new data frame with only tweets with neg sentiment greater than .5 and class is hate speech
df_neg_tweets = sentiment_df.where(sentiment_df['neg']> 0.5).dropna()
df_neg_hate_tweets=df_neg_tweets.where(df_neg_tweets['class']== 0).dropna()
df_neg_hate_tweets

Unnamed: 0,neg,neu,pos,compound,count,hate_speech,offensive_language,neither,class,cleaned_tweets
260,0.720,0.280,0.000,-0.8271,3.0,2.0,1.0,0.0,0.0,Halloween was yesterday stupid nigger
582,0.686,0.314,0.000,-0.9460,3.0,3.0,0.0,0.0,0.0,We hate niggers we hate faggots and we hate spicskkk rally
614,0.593,0.407,0.000,-0.8074,3.0,2.0,1.0,0.0,0.0,You ol trout mouth ass bitch DEEEEAAAADD
647,0.756,0.244,0.000,-0.4767,3.0,2.0,1.0,0.0,0.0,poor whitey
776,0.504,0.496,0.000,-0.7269,3.0,2.0,1.0,0.0,0.0,Are you asian black Hawaiian gay retarded drunk
...,...,...,...,...,...,...,...,...,...,...
23923,0.636,0.364,0.000,-0.5423,3.0,2.0,1.0,0.0,0.0,hoe ass nigga
23982,0.787,0.213,0.000,-0.5719,3.0,2.0,1.0,0.0,0.0,i hate hoes
24085,0.692,0.308,0.000,-0.8750,3.0,3.0,0.0,0.0,0.0,inb you unfunny nigger kill yourself
24455,0.561,0.187,0.252,-0.5106,3.0,2.0,0.0,1.0,0.0,someones clearly a stinky ass beaner


In [88]:
#create new data frame with only tweets with neg sentiment greater than .5 and class is offensive speech
df_neg_offensive_tweets=df_neg_tweets.where(df_neg_tweets['class']== 1).dropna()
df_neg_offensive_tweets

Unnamed: 0,neg,neu,pos,compound,count,hate_speech,offensive_language,neither,class,cleaned_tweets
2,0.620,0.380,0.000,-0.9460,3.0,0.0,3.0,0.0,1.0,Dawg You ever fuck a bitch and she start to cry You be confused as shit
9,0.677,0.323,0.000,-0.7430,3.0,1.0,2.0,0.0,1.0,hobbies include fighting Mariam bitch
15,0.574,0.426,0.000,-0.6597,3.0,0.0,3.0,0.0,1.0,bitch nigga miss me with it
16,0.623,0.164,0.213,-0.5423,3.0,0.0,3.0,0.0,1.0,bitch plz whatever
18,0.667,0.333,0.000,-0.7184,3.0,0.0,3.0,0.0,1.0,bitches get cut off everyday B
...,...,...,...,...,...,...,...,...,...,...
24761,0.508,0.492,0.000,-0.4767,3.0,0.0,3.0,0.0,1.0,you fake niggah lolol
24762,0.612,0.388,0.000,-0.7430,3.0,1.0,2.0,0.0,1.0,you got niggas and i got bitches
24764,0.504,0.496,0.000,-0.7269,3.0,0.0,3.0,0.0,1.0,you gotta understand that these bitches are childish
24773,0.515,0.485,0.000,-0.6486,3.0,1.0,2.0,0.0,1.0,you niggers cheat on ya gfs smh


In [91]:
#create new data frame with only tweets with pos sentiment greater than .5
# df_pos_tweets = sentiment_df.where(sentiment_df['pos']> 0.6).dropna()
# df_pos_tweets['class'].value_counts()

In [98]:
#create new data frame with only tweets with neu sentiment greater than .5 and class is neither
df_neu_tweets = sentiment_df.where(sentiment_df['neu']> 0.6).dropna()
df_neu_neither_tweets=df_neu_tweets.where(df_neu_tweets['class']==2).dropna()
df_neu_neither_tweets

Unnamed: 0,neg,neu,pos,compound,count,hate_speech,offensive_language,neither,class,cleaned_tweets
0,0.000,0.900,0.100,0.2755,3.0,0.0,0.0,3.0,2.0,As a woman you shouldnt complain about cleaning up your house amp as a man you should always take the trash out
40,0.239,0.761,0.000,-0.2960,3.0,0.0,1.0,2.0,2.0,momma said no pussy cats inside my doghouse
63,0.000,1.000,0.000,0.0000,3.0,0.0,0.0,3.0,2.0,SimplyAddictedToGuys woof woof hot scally lad
66,0.000,1.000,0.000,0.0000,3.0,0.0,1.0,2.0,2.0,woof woof and hot soles
67,0.000,0.763,0.237,0.4215,3.0,0.0,1.0,2.0,2.0,Lemmie eat a Oreo amp do these dishes One oreo Lol
...,...,...,...,...,...,...,...,...,...,...
24709,0.000,0.769,0.231,0.4019,3.0,0.0,0.0,3.0,2.0,wish I had a brownie or a dab to knock me out rn
24714,0.000,0.802,0.198,0.6369,3.0,0.0,0.0,3.0,2.0,wondertrade is the best pokemon feature ever i just got a level japanese wobbuffet for a level beginning area trash pokemon
24736,0.155,0.634,0.211,0.2023,3.0,0.0,0.0,3.0,2.0,yaya ho cute avi tho I had no idea she was sleep
24767,0.000,1.000,0.000,0.0000,3.0,0.0,1.0,2.0,2.0,you know what they say the early bird gets the worm puts gummy worms in your morning coffee


In [20]:
###End Sentiment Section

In [21]:
# Define a new lemmatizer
lemmatizer = WordNetLemmatizer()

# Define preprocess function: removes stop words, tokenize, and lemmatize the data passed in
def process_text(data):
    # sw = set(stopwords.words('english'))
    #regex = re.compile("[^a-zA-Z ]")
    #re_clean = regex.sub('', data)
    words = word_tokenize(data)
    # lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in words]
              # if word.lower() not in sw]
    return output

In [96]:
#Loop through df_neg_hate_tweets['cleaned_tweets'] column and append data from each row to neg_hate_tweets string
neg_hate_tweets=""

for i in df_neg_hate_tweets['cleaned_tweets']:
    neg_hate_tweets+=i

In [97]:
#Loop through df_neg_tweets['cleaned_tweets'] column and append data from each row to neg_offensive_tweets string
neg_offensive_tweets=""

for i in df_neg_offensive_tweets['cleaned_tweets']:
    neg_offensive_tweets+=i

In [99]:
#Loop through df_pos_tweets['cleaned_tweets'] column and append data from each row to neu_neither_tweets string
neu_neither_tweets=""

for i in df_neu_neither_tweets['cleaned_tweets']:
    neu_neither_tweets+=i

In [101]:
#Pass neg_hate_tweets to process_text function and store return in processed_neg_hate_tweets
processed_neg_hate_tweets = process_text(neg_hate_tweets)
print(processed_neg_hate_tweets)

['halloween', 'was', 'yesterday', 'stupid', 'niggerwe', 'hate', 'niggers', 'we', 'hate', 'faggots', 'and', 'we', 'hate', 'spicskkk', 'rallyyou', 'ol', 'trout', 'mouth', 'ass', 'bitch', 'deeeeaaaaddpoor', 'whiteyare', 'you', 'asian', 'black', 'hawaiian', 'gay', 'retarded', 'drunkreferred', 'to', 'zimmerman', 'as', 'a', 'creepy', 'ass', 'cracker', 'racist', 'thug', 'racist', 'all', 'month', 'fuck', 'u', 'honkey', 'tea', 'bag', 'a', 'bitch', 'pahahaha', 'fucking', 'white', 'people', 'bitch', 'wut', 'shoot', 'that', 'nigga', 'and', 'his', 'shorty', 'bitch', 'lem', 'me', 'find', 'out', 'you', 'niggaz', 'letting', 'bitches', 'shit', 'on', 'yo', 'faces', 'niggas', 'is', 'weirdos', 'bitch', 'ass', 'nigga', 'fuck', 'you', 'lt', 'these', 'hoes', 'aint', 'got', 'no', 'mannersfaggoti', 'hate', 'that', 'niggerthis', 'is', 'retardedhomo', 'ass', 'crackersape', 'fuck', 'u', 'broke', 'ass', 'racistanswer', 'my', 'snapchat', 'faggotpass', 'the', 'fosters', 'faggotyea', 'u', 'stupid', 'faggot', 'im', 'n

In [103]:
#Pass neg_offesnive_tweets to process_text function and store return in processed_neg_offesnive_tweets
processed_neg_offensive_tweets = process_text(neg_offensive_tweets)
print(processed_neg_offensive_tweets)

['dawg', 'you', 'ever', 'fuck', 'a', 'bitch', 'and', 'she', 'start', 'to', 'cry', 'you', 'be', 'confused', 'as', 'shit', 'hobbies', 'include', 'fighting', 'mariam', 'bitch', 'bitch', 'nigga', 'miss', 'me', 'with', 'it', 'bitch', 'plz', 'whatever', 'bitches', 'get', 'cut', 'off', 'everyday', 'b', 'black', 'bottle', 'amp', 'a', 'bad', 'bitch', 'broke', 'bitch', 'cant', 'tell', 'me', 'nothing', 'cancel', 'that', 'bitch', 'like', 'nino', 'i', 'need', 'a', 'trippy', 'bitch', 'who', 'fuck', 'on', 'hennessy', 'i', 'txt', 'my', 'old', 'bitch', 'my', 'new', 'bitch', 'pussy', 'wetter', 'is', 'that', 'ya', 'bitch', 'jus', 'meet', 'son', 'now', 'he', 'ya', 'mane', 'ass', 'bitches', 'lames', 'crying', 'over', 'hoes', 'thats', 'tears', 'of', 'a', 'clown', 'bitch', 'fuck', 'u', 'maybe', 'youll', 'get', 'better', 'just', 'bitches', 'be', 'wanting', 'to', 'act', 'like', 'niggas', 'so', 'bad', 'that', 'shit', 'aint', 'cuteeeee', 'but', 'niggas', 'act', 'like', 'bitches', 'faggot', 'read', 'my', 'tweets'

In [104]:
#Pass neu_neither_tweets to process_text function and store return in processed_neu_neither_tweets
processed_neu_neither_tweets = process_text(neu_neither_tweets)
print(processed_neu_neither_tweets)



In [107]:
#count items in processed_neg_tweets
neg_counts = Counter(processed_neg_hate_tweets)
neg_counts

Counter({'halloween': 1,
         'was': 4,
         'yesterday': 1,
         'stupid': 15,
         'niggerwe': 1,
         'hate': 20,
         'niggers': 1,
         'we': 5,
         'faggots': 10,
         'and': 19,
         'spicskkk': 1,
         'rallyyou': 1,
         'ol': 2,
         'trout': 1,
         'mouth': 3,
         'ass': 55,
         'bitch': 51,
         'deeeeaaaaddpoor': 1,
         'whiteyare': 1,
         'you': 61,
         'asian': 1,
         'black': 4,
         'hawaiian': 1,
         'gay': 2,
         'retarded': 5,
         'drunkreferred': 1,
         'to': 18,
         'zimmerman': 1,
         'as': 1,
         'a': 94,
         'creepy': 2,
         'cracker': 1,
         'racist': 9,
         'thug': 1,
         'all': 8,
         'month': 1,
         'fuck': 25,
         'u': 17,
         'honkey': 1,
         'tea': 1,
         'bag': 2,
         'pahahaha': 1,
         'fucking': 25,
         'white': 9,
         'people': 3,
         'wut': 1

In [109]:
#count items in processed_neg_offesnive_tweets
neg_offensive_counts = Counter(processed_neg_offensive_tweets)
neg_offensive_counts

Counter({'dawg': 6,
         'you': 433,
         'ever': 11,
         'fuck': 336,
         'a': 947,
         'bitch': 1524,
         'and': 180,
         'she': 53,
         'start': 13,
         'to': 229,
         'cry': 7,
         'be': 153,
         'confused': 4,
         'as': 56,
         'shit': 181,
         'hobbies': 1,
         'include': 1,
         'fighting': 2,
         'mariam': 1,
         'nigga': 100,
         'miss': 19,
         'me': 181,
         'with': 134,
         'it': 64,
         'plz': 1,
         'whatever': 2,
         'bitches': 619,
         'get': 92,
         'cut': 16,
         'off': 30,
         'everyday': 5,
         'b': 13,
         'black': 10,
         'bottle': 1,
         'amp': 45,
         'bad': 183,
         'broke': 38,
         'cant': 40,
         'tell': 10,
         'nothing': 7,
         'cancel': 2,
         'that': 281,
         'like': 146,
         'nino': 2,
         'i': 523,
         'need': 37,
         'trippy': 2,

In [27]:
#count items in processed_pos_tweets
pos_counts = Counter(processed_pos_tweets)
pos_counts

Counter({'i': 28,
         'aint': 5,
         'shit': 4,
         'damn': 1,
         'skippy': 1,
         'lol': 18,
         'happy': 3,
         'birthday': 6,
         'nigs': 1,
         'thanks': 3,
         'yo': 1,
         'pussy': 10,
         'love': 19,
         'it': 5,
         'helping': 1,
         'out': 2,
         'a': 41,
         'mate': 1,
         'scally': 1,
         'play': 1,
         'fuck': 2,
         'yeahlmao': 1,
         'pound': 1,
         'cake': 1,
         'look': 1,
         'like': 13,
         'straight': 1,
         'nation': 1,
         'soccer': 1,
         'champions': 1,
         'wow': 1,
         'pineda': 1,
         'needed': 2,
         'that': 10,
         'great': 3,
         'playwishing': 1,
         'loyal': 3,
         'bitch': 28,
         'find': 1,
         'ho': 1,
         'the': 7,
         'hoes': 8,
         'thats': 2,
         'awesome': 1,
         'prolly': 1,
         'taste': 1,
         'heaven': 1,
         'gr

In [28]:
#count negative tweets and store in new dataframe
df_neg_tweet_counts = pd.DataFrame.from_dict(neg_counts.items(), orient='columns')
df_neg_tweet_counts.rename(columns={0: "word", 1: "count"}, inplace =True)
df_neg_tweet_counts.head()

Unnamed: 0,word,count
0,dawg,6
1,you,502
2,ever,11
3,fuck,362
4,a,1054


In [29]:
#count postive tweets and store in new dataframe
df_pos_tweet_counts = pd.DataFrame.from_dict(pos_counts.items(), orient='columns')
df_pos_tweet_counts.rename(columns={0: "word", 1: "count"}, inplace =True)
df_pos_tweet_counts.head()

Unnamed: 0,word,count
0,i,28
1,aint,5
2,shit,4
3,damn,1
4,skippy,1


In [30]:
#sort negative tweet counts and store in new dataframe
df_neg_tweet_counts_sorted = df_neg_tweet_counts.sort_values(by='count', ascending=False).reset_index()
df_neg_tweet_counts_sorted.head(200)

Unnamed: 0,index,word,count
0,5,bitch,1574
1,4,a,1054
2,26,bitches,631
3,44,i,541
4,64,ass,540
5,1,you,502
6,3,fuck,362
7,103,the,330
8,41,that,305
9,9,to,250


In [31]:
#sort positive tweet counts and store in new dataframe
df_pos_tweet_counts_sorted= df_pos_tweet_counts.sort_values(by='count', ascending=False).reset_index()
df_pos_tweet_counts_sorted.head(200)

Unnamed: 0,index,word,count
0,16,a,41
1,0,i,28
2,37,bitch,28
3,12,love,19
4,5,lol,18
5,25,like,13
6,54,good,12
7,116,dont,11
8,11,pussy,10
9,33,that,10


In [32]:
print(dict(neg_counts))

{'dawg': 6, 'you': 502, 'ever': 11, 'fuck': 362, 'a': 1054, 'bitch': 1574, 'and': 203, 'she': 55, 'start': 14, 'to': 250, 'cry': 7, 'be': 166, 'confused': 4, 'as': 57, 'shit': 194, 'hobbies': 1, 'include': 1, 'fighting': 2, 'mariam': 1, 'nigga': 126, 'miss': 19, 'me': 196, 'with': 143, 'it': 68, 'plz': 2, 'whatever': 2, 'bitches': 631, 'get': 97, 'cut': 16, 'off': 35, 'everyday': 5, 'b': 13, 'black': 14, 'bottle': 1, 'amp': 47, 'bad': 189, 'broke': 41, 'cant': 41, 'tell': 12, 'nothing': 7, 'cancel': 2, 'that': 305, 'like': 155, 'nino': 2, 'i': 541, 'need': 38, 'trippy': 2, 'who': 43, 'on': 161, 'hennessy': 1, 'txt': 1, 'my': 238, 'old': 10, 'new': 16, 'pussy': 55, 'wetter': 1, 'is': 240, 'ya': 43, 'jus': 5, 'meet': 4, 'son': 11, 'now': 25, 'he': 39, 'mane': 4, 'ass': 540, 'lames': 1, 'crying': 13, 'over': 15, 'hoes': 75, 'thats': 44, 'tears': 1, 'of': 101, 'clown': 3, 'u': 156, 'maybe': 3, 'youll': 2, 'better': 10, 'just': 74, 'wanting': 3, 'act': 7, 'niggas': 147, 'so': 90, 'aint': 90

In [33]:
print(dict(neg_counts.most_common(10)))

{'bitch': 1574, 'a': 1054, 'bitches': 631, 'i': 541, 'ass': 540, 'you': 502, 'fuck': 362, 'the': 330, 'that': 305, 'to': 250}


In [34]:
print(dict(pos_counts))

{'i': 28, 'aint': 5, 'shit': 4, 'damn': 1, 'skippy': 1, 'lol': 18, 'happy': 3, 'birthday': 6, 'nigs': 1, 'thanks': 3, 'yo': 1, 'pussy': 10, 'love': 19, 'it': 5, 'helping': 1, 'out': 2, 'a': 41, 'mate': 1, 'scally': 1, 'play': 1, 'fuck': 2, 'yeahlmao': 1, 'pound': 1, 'cake': 1, 'look': 1, 'like': 13, 'straight': 1, 'nation': 1, 'soccer': 1, 'champions': 1, 'wow': 1, 'pineda': 1, 'needed': 2, 'that': 10, 'great': 3, 'playwishing': 1, 'loyal': 3, 'bitch': 28, 'find': 1, 'ho': 1, 'the': 7, 'hoes': 8, 'thats': 2, 'awesome': 1, 'prolly': 1, 'taste': 1, 'heaven': 1, 'grand': 8, 'opening': 8, 'top': 8, 'class': 8, 'oriental': 8, 'massage': 8, 'perfect': 9, 'good': 12, 'girl': 3, 'is': 5, 'worth': 1, 'gt': 2, 'oo': 1, 'hoeshick': 1, 'yesgod': 1, 'still': 1, 'share': 1, 'his': 1, 'amos': 2, 'blessingslol': 1, 'cause': 1, 'these': 3, 'mecongrats': 1, 'lucky': 1, 'monkeythanks': 2, 'hun': 2, 'hahayeah': 1, 'niggah': 4, 'played': 2, 'lowkey': 1, 'wanted': 1, 'brazil': 1, 'to': 1, 'wini': 1, 'cant':

In [35]:
print(dict(pos_counts.most_common(10)))

{'a': 41, 'i': 28, 'bitch': 28, 'love': 19, 'lol': 18, 'like': 13, 'good': 12, 'dont': 11, 'pussy': 10, 'that': 10}


In [112]:
#Ngram for negative hate tweets
ngram_hate_counts = Counter(ngrams(processed_neg_hate_tweets, n=3))
print(dict(ngram_hate_counts))

{('halloween', 'was', 'yesterday'): 1, ('was', 'yesterday', 'stupid'): 1, ('yesterday', 'stupid', 'niggerwe'): 1, ('stupid', 'niggerwe', 'hate'): 1, ('niggerwe', 'hate', 'niggers'): 1, ('hate', 'niggers', 'we'): 1, ('niggers', 'we', 'hate'): 1, ('we', 'hate', 'faggots'): 1, ('hate', 'faggots', 'and'): 1, ('faggots', 'and', 'we'): 1, ('and', 'we', 'hate'): 1, ('we', 'hate', 'spicskkk'): 1, ('hate', 'spicskkk', 'rallyyou'): 1, ('spicskkk', 'rallyyou', 'ol'): 1, ('rallyyou', 'ol', 'trout'): 1, ('ol', 'trout', 'mouth'): 1, ('trout', 'mouth', 'ass'): 1, ('mouth', 'ass', 'bitch'): 1, ('ass', 'bitch', 'deeeeaaaaddpoor'): 1, ('bitch', 'deeeeaaaaddpoor', 'whiteyare'): 1, ('deeeeaaaaddpoor', 'whiteyare', 'you'): 1, ('whiteyare', 'you', 'asian'): 1, ('you', 'asian', 'black'): 1, ('asian', 'black', 'hawaiian'): 1, ('black', 'hawaiian', 'gay'): 1, ('hawaiian', 'gay', 'retarded'): 1, ('gay', 'retarded', 'drunkreferred'): 1, ('retarded', 'drunkreferred', 'to'): 1, ('drunkreferred', 'to', 'zimmerman')

In [117]:
print(dict(ngram_hate_counts.most_common(20)))

{('bitch', 'ass', 'nigga'): 4, ('his', 'shorty', 'bitch'): 2, ('the', 'fuck', 'up'): 2, ('you', 'a', 'nigger'): 2, ('a', 'lame', 'nigga'): 2, ('you', 'a', 'lame'): 2, ('more', 'than', 'bitches'): 2, ('than', 'bitches', 'these'): 2, ('the', 'next', 'nigga'): 2, ('next', 'nigga', 'bitch'): 2, ('ass', 'nigga', 'i'): 2, ('nigga', 'i', 'hate'): 2, ('for', 'me', 'they'): 2, ('halloween', 'was', 'yesterday'): 1, ('was', 'yesterday', 'stupid'): 1, ('yesterday', 'stupid', 'niggerwe'): 1, ('stupid', 'niggerwe', 'hate'): 1, ('niggerwe', 'hate', 'niggers'): 1, ('hate', 'niggers', 'we'): 1, ('niggers', 'we', 'hate'): 1}


In [115]:
#Ngram for negative offensive tweets
ngram_offensive_counts = Counter(ngrams(processed_neg_offensive_tweets, n=3))
print(dict(ngram_offensive_counts))

{('dawg', 'you', 'ever'): 1, ('you', 'ever', 'fuck'): 1, ('ever', 'fuck', 'a'): 1, ('fuck', 'a', 'bitch'): 8, ('a', 'bitch', 'and'): 5, ('bitch', 'and', 'she'): 1, ('and', 'she', 'start'): 1, ('she', 'start', 'to'): 1, ('start', 'to', 'cry'): 1, ('to', 'cry', 'you'): 1, ('cry', 'you', 'be'): 1, ('you', 'be', 'confused'): 1, ('be', 'confused', 'as'): 1, ('confused', 'as', 'shit'): 1, ('as', 'shit', 'hobbies'): 1, ('shit', 'hobbies', 'include'): 1, ('hobbies', 'include', 'fighting'): 1, ('include', 'fighting', 'mariam'): 1, ('fighting', 'mariam', 'bitch'): 1, ('mariam', 'bitch', 'bitch'): 1, ('bitch', 'bitch', 'nigga'): 1, ('bitch', 'nigga', 'miss'): 1, ('nigga', 'miss', 'me'): 1, ('miss', 'me', 'with'): 1, ('me', 'with', 'it'): 1, ('with', 'it', 'bitch'): 1, ('it', 'bitch', 'plz'): 1, ('bitch', 'plz', 'whatever'): 1, ('plz', 'whatever', 'bitches'): 1, ('whatever', 'bitches', 'get'): 1, ('bitches', 'get', 'cut'): 1, ('get', 'cut', 'off'): 1, ('cut', 'off', 'everyday'): 1, ('off', 'everyd

In [118]:
print(dict(ngram_offensive_counts.most_common(20)))

{('a', 'bad', 'bitch'): 35, ('i', 'hate', 'a'): 20, ('ass', 'bitch', 'i'): 20, ('bitch', 'ass', 'nigga'): 18, ('bitch', 'i', 'aint'): 17, ('a', 'bitch', 'that'): 16, ('a', 'bitch', 'i'): 14, ('stupid', 'ass', 'bitch'): 13, ('shut', 'the', 'fuck'): 13, ('cunt', 'cunt', 'cunt'): 12, ('in', 'this', 'bitch'): 11, ('i', 'hate', 'when'): 10, ('hate', 'a', 'bitch'): 10, ('i', 'got', 'bitches'): 10, ('a', 'bitch', 'aint'): 10, ('the', 'fuck', 'up'): 10, ('bitch', 'i', 'dont'): 10, ('a', 'dumb', 'bitch'): 10, ('your', 'bitch', 'ass'): 9, ('bitch', 'i', 'hate'): 9}


In [116]:
#Ngram for neither tweets
ngram_neither_counts = Counter(ngrams(processed_neu_neither_tweets, n=3))
print(dict(ngram_neither_counts))



In [120]:
print(dict(ngram_neither_counts.most_common(20)))

{('in', 'the', 'trash'): 16, ('planet', 'of', 'the'): 16, ('of', 'the', 'apes'): 16, ('a', 'lot', 'of'): 15, ('to', 'be', 'a'): 11, ('the', 'trash', 'can'): 10, ('look', 'like', 'a'): 10, ('bird', 'gets', 'the'): 9, ('the', 'planet', 'of'): 9, ('mans', 'trash', 'is'): 8, ('is', 'for', 'the'): 8, ('got', 'ta', 'be'): 8, ('gon', 'na', 'be'): 8, ('gets', 'the', 'worm'): 7, ('for', 'a', 'new'): 7, ('out', 'the', 'trash'): 7, ('you', 'are', 'a'): 7, ('of', 'a', 'feather'): 7, ('in', 'front', 'of'): 7, ('at', 'yankee', 'stadium'): 7}


In [121]:
df_neg_hate_tweet_phrases = pd.DataFrame.from_dict(ngram_hate_counts.items(), orient='columns')
df_neg_hate_tweet_phrases.rename(columns={0: "3_word_phrase", 1: "count"}, inplace =True)
df_neg_hate_tweet_phrases.head()

Unnamed: 0,3_word_phrase,count
0,"(halloween, was, yesterday)",1
1,"(was, yesterday, stupid)",1
2,"(yesterday, stupid, niggerwe)",1
3,"(stupid, niggerwe, hate)",1
4,"(niggerwe, hate, niggers)",1


In [129]:
df_neg_hate_tweet_phrases_sorted = df_neg_hate_tweet_phrases.sort_values(by='count', ascending=False)
df_neg_hate_tweet_phrases_sorted.head()

Unnamed: 0,3_word_phrase,count
76,"(bitch, ass, nigga)",4
58,"(his, shorty, bitch)",2
199,"(the, fuck, up)",2
926,"(a, lame, nigga)",2
1591,"(nigga, i, hate)",2


In [127]:
df_ngram_offensive_counts_phrases = pd.DataFrame.from_dict(ngram_offensive_counts.items(), orient='columns')
df_ngram_offensive_counts_phrases.rename(columns={0: "3_word_phrase", 1: "count"}, inplace =True)
df_ngram_offensive_counts_phrases.head()

Unnamed: 0,3_word_phrase,count
0,"(dawg, you, ever)",1
1,"(you, ever, fuck)",1
2,"(ever, fuck, a)",1
3,"(fuck, a, bitch)",8
4,"(a, bitch, and)",5


In [125]:
df_ngram_offensive_counts_phrases_sorted = df_ngram_offensive_counts_phrases.sort_values(by='count', ascending=False)
df_ngram_offensive_counts_phrases_sorted.head()

Unnamed: 0,4_word_phrase,count
39,"(a, bad, bitch)",35
1099,"(ass, bitch, i)",20
153,"(i, hate, a)",20
173,"(bitch, ass, nigga)",18
1100,"(bitch, i, aint)",17


In [128]:
df_ngram_neither_counts_phrases = pd.DataFrame.from_dict(ngram_neither_counts.items(), orient='columns')
df_ngram_neither_counts_phrases.rename(columns={0: "3_word_phrase", 1: "count"}, inplace =True)
df_ngram_neither_counts_phrases.head()

Unnamed: 0,3_word_phrase,count
0,"(as, a, woman)",1
1,"(a, woman, you)",1
2,"(woman, you, shouldnt)",1
3,"(you, shouldnt, complain)",1
4,"(shouldnt, complain, about)",1


In [130]:
df_ngram_neither_counts_phrases_sorted = df_ngram_neither_counts_phrases.sort_values(by='count', ascending=False)
df_ngram_neither_counts_phrases_sorted.head()

Unnamed: 0,3_word_phrase,count
2126,"(in, the, trash)",16
9917,"(planet, of, the)",16
9918,"(of, the, apes)",16
1091,"(a, lot, of)",15
3282,"(to, be, a)",11


In [67]:
#### START NAIVE BAYES ANALYSIS

In [68]:
# Assigning features and label variables
weather=['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast','Sunny','Sunny',
'Rainy','Sunny','Overcast','Overcast','Rainy']
temp=['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild']

play=['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']


In [69]:
#creating labelEncoder
le = preprocessing.LabelEncoder()

# Converting string labels into numbers.
weather_encoded=le.fit_transform(weather)
print(weather_encoded)


[2 2 0 1 1 1 0 2 2 1 2 0 0 1]


In [70]:
# Converting string labels into numbers
temp_encoded=le.fit_transform(temp)
label=le.fit_transform(play)
print("Temp:",temp_encoded)
print("Play:",label)

Temp: [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
Play: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [48]:
#Combinig weather and temp into single listof tuples
features=list(zip(weather_encoded,temp_encoded))


In [49]:
print(features)

[(2, 1), (2, 1), (0, 1), (1, 2), (1, 0), (1, 0), (0, 0), (2, 2), (2, 0), (1, 2), (2, 2), (0, 2), (0, 1), (1, 2)]


In [50]:
#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(features,label)

#Predict Output
predicted= model.predict([[0,2]]) # 0:Overcast, 2:Mild
print("Predicted Value:", predicted)


Predicted Value: [1]


In [51]:
# Testing classifier on a dataset

In [52]:
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
wine = datasets.load_wine()

In [53]:
# print the names of the 13 features
print("Features: ", wine.feature_names)

# print the label type of wine(class_0, class_1, class_2)
print("Labels: ", wine.target_names)

Features:  ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Labels:  ['class_0' 'class_1' 'class_2']


In [54]:
# print data(feature)shape
wine.data.shape


(178, 13)

In [55]:
# print the wine data features (top 5 records)
print(wine.data[0:5])


[[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
  2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
  2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
  3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]
 [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
  2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]
 [1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00
  3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]


In [56]:
# print the wine labels (0:Class_0, 1:class_2, 2:class_2)
print(wine.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [57]:
# Import train_test_split function
from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3,random_state=109) # 70% training and 30% test


In [58]:
#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = gnb.predict(X_test)


In [59]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.9074074074074074


In [60]:
#### END NAIVE BAYES ANALYSIS