# Machine Learning Challenge
The Rowdy Coderunners: Lily He, Richard Tarbell, Jenna Wallace

### Step 1: Import the training and test data

In [1]:
import csv
import numpy as np


X_txt_train = []
y_txt_train = []

with open('./train.tsv', encoding="utf8") as myTrainFile:
    train = csv.reader(myTrainFile, delimiter='\t', quoting = csv.QUOTE_NONE)
    for i in train:
        X_txt_train.append(i[1])
        y_txt_train.append(i[2])


        
X_txt_test = []
y_txt_test = []
ID_txt_test = []

with open('./test.tsv', encoding="utf8") as myTestFile:
    test = csv.reader(myTestFile, delimiter='\t', quoting = csv.QUOTE_NONE)
    for i in test:
        ID_txt_test.append(i[0])
        X_txt_test.append(i[1])
        y_txt_test.append(i[2])
        


In [2]:
# make sure the data is read correctly
print(X_txt_train[0])
print(y_txt_train[0])
print(len(X_txt_train), len(y_txt_train))

@USER She should ask a few native Americans what their take on this is.
UNT
10592 10592


In [3]:
print(X_txt_test[0])
print(y_txt_test[0])
print(len(X_txt_test), len(y_txt_test))

@USER Nancy Lee Grahn You Are Awesome! I have been a fan since Santa Barbara!! Alex Davis also Rocks!!!!! Thank you !!!
NOT
2648 2648


### Step 2: Convert the lists to arrays

In [4]:
X_txt_train = np.array(X_txt_train)
y_txt_train = np.array(y_txt_train)

In [5]:
X_txt_test = np.array(X_txt_test)
y_txt_test = np.array(y_txt_test)

### Step 3: Explore best vectorizing parameters
Run a pipeline to determine best parameters for vectorizing (n-grams, df, whether to include stopwords).

In [6]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

from warnings import filterwarnings
filterwarnings('ignore') #turning off convergence warnings
    
import numpy as np
np.random.seed(1)
import random
random.seed(1)

pipeline = Pipeline([('vec', CountVectorizer()), 
                     ('skb', SelectKBest()),
                     ('clf', LinearSVC(random_state=1))])
params = {'vec__ngram_range':[(1,1),(1,2)],
          'vec__min_df':(1,2,4,5),
          'vec__stop_words':['english', 'None'],
          'skb__k':[10,500, 1000, 5000,'all'],
         'clf__C':[0.01, 0.1,1, 10, 100, 1000]}

initX_train, initX_test, inity_train, inity_test = train_test_split(X_txt_train, y_txt_train, test_size=0.2, random_state=1)

clf = GridSearchCV(pipeline, params, cv=5, scoring="f1_macro")

clf.fit(initX_train, inity_train)

preds = clf.predict(initX_test)

print('Best score:', clf.best_score_)
print('Best params:', clf.best_params_)

Best score: 0.4854342905510299
Best params: {'clf__C': 1, 'skb__k': 'all', 'vec__min_df': 1, 'vec__ngram_range': (1, 2), 'vec__stop_words': 'english'}


### Step 4: Feature Engineering - Target Words

Define 4 new functions to count pronouns that might indicate targetted offense (you, male, female, group/nonbinary)

In [10]:
import re
def youwords(sentence):
    you_words = 0
    for word in sentence.lower().split():
        if re.search("you", word):
            you_words += 1
    return you_words

def malepro(sentence):
    male = 0
    for word in sentence.lower().split():
        if re.search(r"\b(he|his|him)\b", word):
            male += 1
    return male

def femalepro(sentence):
    female = 0
    for word in sentence.lower().split():
        if re.search(r"\b(she|her|hers)\b", word):
            female += 1
    return female

def nonbin(sentence):
    nonbin = 0
    for word in sentence.lower().split():
        if re.search(r"\b(they|them|their)\b", word):
            nonbin += 1
    return nonbin

def pointsCount(sentence):
    points = 0
    for word in sentence.lower().split():
        if re.search(r"!+", word):
            points += 1
    return points

#### Training data

In [11]:
# this creates four features, one for each group of targets
X_train_all_targets = [] 

for item in X_txt_train:
    you_counts = youwords(item)
    male_counts = malepro(item)
    female_counts = femalepro(item)
    nonbin_counts = nonbin(item)
    X_train_all_targets.append([you_counts, male_counts, female_counts, nonbin_counts])

X_train_all_targets = np.array(X_train_all_targets)

#### For the test data

In [12]:
# this creates four features, one for each group of targets
X_test_all_targets = [] 

for item in X_txt_test:
    you_counts = youwords(item)
    male_counts = malepro(item)
    female_counts = femalepro(item)
    nonbin_counts = nonbin(item)
    X_test_all_targets.append([you_counts, male_counts, female_counts, nonbin_counts])

X_test_all_targets = np.array(X_test_all_targets)

### Step 5: Feature Engineering - Positive and Negative Words

Count positive and negative words in each tweet

In [13]:
class LexiconClassifier():
    def __init__(self):
        
        # Initalize the Lexicon classifer by loading lexicons. 
        
        self.positive_words = set()
        with open('positive-words.txt', encoding = 'utf-8') as iFile:
            for row in iFile:
                self.positive_words.add(row.strip())

        self.negative_words = set()
        with open('negative-words.txt', encoding='iso-8859-1') as iFile:
            for row in iFile:
                self.negative_words.add(row.strip())

   
    def count_pos_words(self, sentence):
        #Returns the number of positive words in string
            
        num_pos_words = 0
        for word in sentence.lower().split():
            if word in self.positive_words:
                num_pos_words += 1
        return num_pos_words

    def count_neg_words(self, sentence):
        #Returns the number of negative words in string
            
        num_neg_words = 0
        for word in sentence.lower().split():
            if word in self.negative_words:
                num_neg_words += 1
        return num_neg_words

#### Training data
Instantiate the LC classifier, loop over X_txt_train, append to new list and cast to array to create a new feature: X_train_pos_neg

In [14]:
# This creates a feature of both positive and negative words
myLC = LexiconClassifier()
X_train_pos_neg = [] 

for item in X_txt_train:
    pos_train_counts = myLC.count_pos_words(item)
    neg_train_counts = myLC.count_neg_words(item)
    X_train_pos_neg.append([pos_train_counts,neg_train_counts])

X_train_pos_neg = np.array(X_train_pos_neg)


#### Test data

In [15]:
# This creates a feature of both positive and negative words for the test data
myLC = LexiconClassifier()
X_test_pos_neg = [] 

for item in X_txt_test:
    pos_test_counts = myLC.count_pos_words(item)
    neg_test_counts = myLC.count_neg_words(item)
    X_test_pos_neg.append([pos_test_counts,neg_test_counts])

X_test_pos_neg = np.array(X_test_pos_neg)


### Step 6: Feature Engineering - Offensive Words

Establish a lexicon classifier class using the bad-words.txt to determine if there are offensive words present in each tweet

In [7]:
class OffensiveClassifier():
    def __init__(self):
        # Initalize the Lexicon classifer by loading the bad word lexicon 
        
        self.bad_words = set()
        with open('bad-words.txt', encoding = 'utf-8') as iFile:
            for row in iFile:
                self.bad_words.add(row.strip())

    
    def count_bad_words(self, sentence):
       # Returns the number of bad words in string
         
        num_bad_words = 0
        for word in sentence.lower().split():
            if word in self.bad_words:
                num_bad_words += 1
        return num_bad_words
    
    def bad_words_present (self, sentence):
        #  Returns 1 if bad word in string, 0 if not
           
        bad_word_present = 0
        for word in sentence.lower().split():
            if word in self.bad_words:
                bad_word_present = 1
        return bad_word_present

#### Training data

Instantiate the OC classifier, loop over X_txt_train, append to new list and cast to array to create a new feature: X_train_off_count

In [8]:
myOC = OffensiveClassifier()

X_train_off_count = [] 

for item in X_txt_train:
    bad_count = myOC.count_bad_words(item)
    X_train_off_count.append([bad_count])

X_train_off_count = np.array(X_train_off_count)


#### For the test data

In [9]:
myOC = OffensiveClassifier()

X_test_off_count = [] 

for item in X_txt_test:
    bad_count = myOC.count_bad_words(item)
    X_test_off_count.append([bad_count])

X_test_off_count = np.array(X_test_off_count)

### Step 7: CountVectorizer

#### Training set

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(ngram_range=(1,2), stop_words='english')

X_train_sparse = vec.fit_transform(X_txt_train)

print(X_train_sparse.shape)

(10592, 97807)


#### Test set

In [17]:
X_test_sparse = vec.transform(X_txt_test)
print(X_test_sparse.shape)

(2648, 97807)


### Step 8: Train the Model


Only do this step for the training set

In [18]:
from sklearn.model_selection import GridSearchCV, train_test_split

import scipy.sparse as sp
from scipy.sparse import hstack
import numpy as np
np.random.seed(1)


# if we use .toarray() we will get a dense matrix instead of the sparse matrix given. 
# The sparse matrix does not include all the zero values a dense matrix would
X_train_total = hstack([X_train_sparse, X_train_off_count, X_train_all_targets, X_train_pos_neg])

X_train, X_test, y_train, y_test = train_test_split(X_train_total, y_txt_train, test_size=.2) # 80/20 percent split

In [19]:
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score
from sklearn.svm import LinearSVC
from warnings import filterwarnings
filterwarnings('ignore') #turning off convergence warnings because they are annoying!

# all target words, offensive count, and pos/neg lexicon

svc = LinearSVC() 

parameters = {'C':[0.01, 0.1, 1., 10.]}

clf = GridSearchCV(svc, parameters, cv=5, scoring = "f1_macro")
clf.fit(X_train,y_train)

preds = clf.predict(X_test)
macrof1 = f1_score(y_test, preds,average = 'macro')
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average = 'macro') 
recall = recall_score(y_test, preds, average = 'macro') 
microf1 = f1_score(y_test, preds, average = 'micro')
print('f1 macro:', macrof1)
print("f1 micro: {:.4f}".format(microf1))
print("accuracy: {}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

f1 macro: 0.5062340278713133
f1 micro: 0.7277
accuracy: 0.7277017461066541
Precision: 0.5975
Recall: 0.4894


In [20]:
# We can see our model is ~99% accurate on the training dataset
preds_train = clf.predict(X_train)
accuracy_train = accuracy_score(y_train, preds_train)
print("accuracy on training data: {}".format(accuracy_train))

accuracy on training data: 0.9964593414375074


### Error Analysis on Validation set

In [21]:
num_tweets = 0
false_pos = 0
false_neg = 0
for pred, y, txt in zip(preds, y_test, X_txt_train):
    if pred == "TIN" and y == "NOT":
        false_pos += 1
    if pred == "UNT" and y == "NOT":
        false_pos += 1
    if pred == "NOT" and y == "UNT":
        false_neg += 1
    if pred == "NOT" and y == "TIN":
        false_neg += 1
     
    num_tweets += 1
    #if num_tweets == 1000:
        #break
print("Total Tweets: {}".format(num_tweets))
print("False Positive: {}".format(false_pos))
print("False Negative: {}".format(false_neg)) 

Total Tweets: 2119
False Positive: 160
False Negative: 363


In [23]:
num_tweets = 0
for pred, y, txt in zip(preds, y_test, X_txt_train):
    print("Tweet: {}".format(txt))
    print("Ground-Truth Class: {}".format(y))
    print("Prediction: {}".format(pred))
    print()
  
    num_tweets += 1
    if num_tweets == 5:
        break

Tweet: @USER She should ask a few native Americans what their take on this is.
Ground-Truth Class: NOT
Prediction: NOT

Tweet: @USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL
Ground-Truth Class: NOT
Prediction: NOT

Tweet: Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT
Ground-Truth Class: NOT
Prediction: NOT

Tweet: @USER Someone should'veTaken" this piece of shit to a volcano. 😂"
Ground-Truth Class: TIN
Prediction: NOT

Tweet: @USER @USER Obama wanted liberals &amp; illegals to move into red states
Ground-Truth Class: NOT
Prediction: NOT



In [24]:
# we can check the performance on any particular class to see if there are clues when it isn't predicting properly
num_tweets = 0
for pred, y, txt in zip(preds, y_test, X_txt_train):
    if y == "UNT":
        print("Tweet: {}".format(txt))
        print("Ground-Truth Class: {}".format(y))
        print("Prediction: {}".format(pred))
        print()
  
        num_tweets += 1
        if num_tweets == 5:
            break

Tweet: @USER Been a Willie fan since before most of you were born....LOVE that he is holding a rally with Beto.... Exactly WHICH fans are furious?  Could you give some specifics? URL
Ground-Truth Class: UNT
Prediction: NOT

Tweet: @USER @USER No conflict here—don’t look at this.  #Soros involved in violent left agenda of Antifa also.
Ground-Truth Class: UNT
Prediction: TIN

Tweet: @USER Holder needs to be prosecuted
Ground-Truth Class: UNT
Prediction: NOT

Tweet: *babysitting 3 kids*  people:”how old do you think she is?” “she has three kids”  me:*dont get mad. don’t get mad. they don’t know. they don’t know*
Ground-Truth Class: UNT
Prediction: NOT

Tweet: @USER @USER Try looking for plain old democrats.  The liberals are the ones which you can’t have a conversation with.
Ground-Truth Class: UNT
Prediction: UNT



### Step 9: Input the Test data into the model and save the predictions

In [25]:
X_test_total = hstack([X_test_sparse, X_test_off_count, X_test_all_targets, X_test_pos_neg])

In [26]:
test_preds = clf.predict(X_test_total)

In [23]:
test_output = np.dstack((ID_txt_test, X_txt_test, test_preds))

test_out = test_output.reshape(len(X_txt_test),3)

np.savetxt('output.tsv', test_out,delimiter="\t",fmt='%s',encoding="utf8")

### Step 10: Error Analysis

In [28]:
num_tweets = 0
for pred, txt in zip(test_preds, X_txt_test):
    print("Tweet: {}".format(txt))
    print("Prediction: {}".format(pred))
    print()
  
    num_tweets += 1
    if num_tweets == 5:
        break

Tweet: @USER Nancy Lee Grahn You Are Awesome! I have been a fan since Santa Barbara!! Alex Davis also Rocks!!!!! Thank you !!!
Prediction: NOT

Tweet: @USER She is a Skrull. Enemy of The Kree. The Kree are who gave Carol her powers and whose uniform she is wearing in the first few moments of the trailer.
Prediction: TIN

Tweet: @USER @USER @USER @USER @USER @USER @USER Except you kind of are when it comes to gun control
Prediction: NOT

Tweet: @USER @USER @USER You are so beautiful♡
Prediction: NOT

Tweet: @USER This is what happens when liberals get in control
Prediction: NOT



In [29]:
# check the performance on any particular class to see if there are clues when it isn't predicting properly
num_tweets = 0
for pred, txt in zip(test_preds, X_txt_test):
    if pred == "UNT":
        print("Tweet: {}".format(txt))
        print("Prediction: {}".format(pred))
        print()
  
        num_tweets += 1
        if num_tweets == 5:
            break

Tweet: @USER I see boobs lol 😂😜😘😇
Prediction: UNT

Tweet: @USER I never said that. I merely refuted the statement that they don’t have a tight end. Virgil Green is a solid tight end when he is surrounded by the talent that the chargers have
Prediction: UNT

Tweet: @USER Fuck the NFL
Prediction: UNT

Tweet: @USER You’re welcome! Yo! @USER and I were holding up a casual tournament watching you. Good shit!
Prediction: UNT

Tweet: @USER what the fuck...
Prediction: UNT

