## This notebook is used to generate the finalized version of the classifier, to simply feature transformation into the final form, and to test that the results are the same

Most of the code comes from operational_classifier.

In [1]:
import pandas as pd
import numpy as np
import pickle
import sys
#reload(sys)
#sys.setdefaultencoding("utf-8")
# Choose the data we want to use.
# Typically I use data_name to signify which data I used, like 'black' means the file black_dataset.csv, 'balanced' means the file balanced_dataset.csv
#data_name = 'black'
#data_name = 'women'
data_name = 'balanced'

#Loading raw data
df = pd.read_csv(open(f"../data/{data_name}_dataset.csv",'rb'))

model_file = f'../data/{data_name}_model.pkl'
tfidf_file = f'../data/{data_name}_tfidf.pkl'
idf_file = f'../data/{data_name}_idf.pkl'
pos_file = f'../data/{data_name}_pos.pkl'
oth_file = f'../data/{data_name}_oth.pkl'

tweets = df.text

## Feature generation

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
import joblib

stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    #parsed_text = parsed_text.code("utf-8", errors='ignore')
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]+", tweet.lower())).strip()
    #tokens = re.split("[^a-zA-Z]*", tweet.lower())
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]+", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.501
    )

In [3]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names_out())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores



In [4]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    #for i in range(0, len(tokens)):
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)
        #print(tokens[i],tag_list[i])

In [5]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.501,
    )

In [6]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names_out())}

In [7]:
#Now get other features
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [8]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

In [9]:
feats = get_feature_array(tweets)
pos_full = pos_vectorizer.get_feature_names_out()
print(pos_full)
print(other_features_names)

['CC' 'CC CC' 'CC CC VB' ... 'WRB VBZ PRP' 'WRB VBZ VBG' 'WRB WRB']
['FKRA', 'FRE', 'num_syllables', 'avg_syl_per_word', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words', 'vader neg', 'vader pos', 'vader neu', 'vader compound', 'num_hashtags', 'num_mentions', 'num_urls', 'is_retweet']


In [10]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats],axis=1)

In [11]:
M.shape

(10000, 9094)

In [12]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

# Running the model

This model was found using a GridSearch with 5-fold cross validation. Details are in the notebook operational_classifier.

In [13]:
X = pd.DataFrame(M)
y = df['class'].astype(int)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [15]:
select = SelectFromModel(LogisticRegression(class_weight='balanced',solver='liblinear',penalty="l1",C=0.01))
X_ = select.fit_transform(X,y)

In [16]:
model = LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_, y)
joblib.dump(model, model_file) 



['../data/balanced_model.pkl']

In [17]:
y_preds = model.predict(X_)

In [18]:
report = classification_report( y, y_preds )

In [19]:
print(report)

              precision    recall  f1-score   support

           0       0.57      0.77      0.66      2644
           2       0.91      0.79      0.85      7356

    accuracy                           0.79     10000
   macro avg       0.74      0.78      0.75     10000
weighted avg       0.82      0.79      0.80     10000



# Using information from the model to obtain the matrix X_ generically

This is the most difficult task: We have to take the inputs tweets and transform them into a format that can be used in the model without going through all the same pre-processing steps as above. This can be done as follows.

## Obtaining information about the model

In [20]:
final_features = select.get_support(indices=True) #get indices of features
final_feature_list = [(feature_names[i]) for i in final_features] #Get list of names corresponding to indices

In [21]:
print(final_feature_list)

['africa', 'america', 'ass', 'babi', 'back', 'becaus', 'bitch', 'black', 'cock', 'cunt', 'deport', 'disgust', 'dumb', 'estrogen', 'face', 'fag', 'faggot', 'feel', 'fuck', 'gender', 'girl', 'go', 'hoe', 'hole', 'homosexu', 'hope', 'human', 'idiot', 'jew', 'know', 'like', 'littl', 'lol', 'look like', 'love', 'mean', 'mouth', 'nigger', 'peopl', 'piec', 'pride', 'pussi', 'put', 'racism', 'racist', 'retard', 'say', 'shoot', 'shut', 'slut', 'still', 'stupid', 'take', 'tran', 'tranni', 'trash', 'ugli', 'url', 'wa', 'whore', 'woman', 'women', 'VB PRP', 'VBZ', 'FKRA', 'FRE', 'num_syllables', 'num_chars', 'num_chars_total', 'num_words', 'vader compound']


In [22]:
#Getting names for each class of features
pos_indices = []
for item in pos_full:
    if item in final_feature_list:
        # If item is found, append its lowest index to the indices list
        pos_indices.append(final_feature_list.index(item))

pos_min_index = min(pos_indices)
pos_max_index = max(pos_indices)
ngram_features = final_feature_list[:pos_min_index]
pos_features = final_feature_list[pos_min_index:pos_max_index+1]
oth_features = final_feature_list[pos_max_index+1:]

#ngram_features = final_feature_list[:final_feature_list.index('zionist')+1]
#pos_features = final_feature_list[final_feature_list.index('zionist')+1:final_feature_list.index('WRB')+1]
#oth_features = final_feature_list[final_feature_list.index('WRB')+1:]

In [23]:
print(pos_features)
print(oth_features)
joblib.dump(oth_features, oth_file) 

['VB PRP', 'VBZ']
['FKRA', 'FRE', 'num_syllables', 'num_chars', 'num_chars_total', 'num_words', 'vader compound']


['../data/balanced_oth.pkl']

## Generating ngram features

In [24]:
new_vocab = {v:i for i, v in enumerate(ngram_features)}
new_vocab_to_index = {}
for k in ngram_features:
    new_vocab_to_index[k] = vocab[k]

In [25]:
#Get indices of text features
ngram_indices = final_features[:len(ngram_features)]

In [26]:
#TODO: Pickle new vectorizer

In [27]:
new_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_vocab
    )

In [28]:
joblib.dump(new_vectorizer, tfidf_file) 

['../data/balanced_tfidf.pkl']

In [29]:
tfidf_ = new_vectorizer.fit_transform(tweets).toarray()



In [30]:
#Verifying that results are the same

In [31]:
tfidf_[1,:]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [32]:
tfidf_[1,:].sum()

2.0

In [33]:
X_[1,:tfidf_.shape[1]]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       3.79197544, 0.        , 4.83970234, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [34]:
X_[1,:tfidf_.shape[1]].sum()

8.631677779888491

Results are the same if use IDF but the problem is that IDF will be different if we use different data. Instead we have to use the original IDF scores and multiply them by the new matrix.

In [35]:
print(len(tfidf))

10000


In [36]:
idf_vals_ = idf_vals[ngram_indices]

In [37]:
idf_vals_.shape

(62,)

In [38]:
#TODO: Pickle idf_vals

joblib.dump(idf_vals_, idf_file) 

['../data/balanced_idf.pkl']

In [39]:
(tfidf_[1,:]*idf_vals_) == X_[1,:153] #Got same value as final process array!

  (tfidf_[1,:]*idf_vals_) == X_[1,:153] #Got same value as final process array!


False

In [40]:
tfidf_*idf_vals_ == X_[:,:153]

  tfidf_*idf_vals_ == X_[:,:153]


False

In [41]:
tfidffinal = tfidf_*idf_vals_

## Generating POS features
This is simpler as we do not need to worry about IDF but it will be slower as we have to compute the POS tags for the new data. Here we can simply use the old POS tags.

In [42]:
new_pos = {v:i for i, v in enumerate(pos_features)}

In [43]:
#TODO: Pickle pos vectorizer
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
new_pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_pos
    )

In [44]:
joblib.dump(new_pos_vectorizer, pos_file) 

['../data/balanced_pos.pkl']

In [45]:
pos_ = new_pos_vectorizer.fit_transform(tweet_tags).toarray()

In [46]:
pos_[1,:]

array([1., 0.])

In [47]:
X_[1,153:159]

array([], dtype=float64)

In [48]:
pos_[:,:] == X_[:,153:159]

  pos_[:,:] == X_[:,153:159]


False

In [49]:
pos_[:,:].sum()

10463.0

In [50]:
X_[:,153:159].sum()

0.0

## Finally, we can look at the other features

In [51]:
print(other_features_names)

['FKRA', 'FRE', 'num_syllables', 'avg_syl_per_word', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words', 'vader neg', 'vader pos', 'vader neu', 'vader compound', 'num_hashtags', 'num_mentions', 'num_urls', 'is_retweet']


In [52]:
print(oth_features)

['FKRA', 'FRE', 'num_syllables', 'num_chars', 'num_chars_total', 'num_words', 'vader compound']


The functions can be modified to only calculate and return necessary fields.

In [53]:
def other_features_(tweet, oth_features):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    retweet = 0
    if "rt" in words:
        retweet = 1
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    
    features = []
        
    if "FKRA" in oth_features:
        features.append(FKRA)
    if "FRE" in oth_features:
        features.append(FRE)
    if "num_syllables" in oth_features:
        features.append(syllables)
    if "avg_syl_per_word" in oth_features:
        features.append(avg_syl)
    if "num_chars" in oth_features:
        features.append(num_chars)
    if "num_chars_total" in oth_features:
        features.append(num_chars_total)
    if "num_terms" in oth_features:
        features.append(num_terms)
    if "num_words" in oth_features:
        features.append(num_words)
    if "num_unique_words" in oth_features:
        features.append(num_unique_terms)
    if "vader neg" in oth_features:
        features.append(sentiment['neg'])
    if "vader pos" in oth_features:
        features.append(sentiment['pos'])
    if "vader neu" in oth_features:
        features.append(sentiment['neu'])
    if "vader compound" in oth_features:
        features.append(sentiment['compound'])
    if "num_hashtags" in oth_features:
        features.append(twitter_objs[2])
    if "num_mentions" in oth_features:
        features.append(twitter_objs[1])
    if "num_urls" in oth_features:
        features.append(twitter_objs[0])
    if "is_retweet" in oth_features:
        features.append(retweet)

    #features = [FKRA, FRE, syllables, num_chars, num_chars_total, num_terms,
    #            num_unique_terms, sentiment['neg'], sentiment['compound'],
    #            twitter_objs[2]]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array_(tweets, oth_features):
    feats=[]
    for t in tweets:
        feats.append(other_features_(t, oth_features))
    return np.array(feats)

In [54]:
oth_feature_names = joblib.load(oth_file)
feats_ = get_feature_array_(tweets, oth_feature_names)

In [55]:
feats_[0,:]


array([  0.5   , 102.05  ,   7.    ,  30.    ,  30.    ,   6.    ,
         0.5538])

In [56]:
X_[0,159:]

array([], dtype=float64)

In [57]:
feats_[:,:] == X_[:,159:]

  feats_[:,:] == X_[:,159:]


False

## Now that we have put it all together using a simplified process we can assess if these new data return the same answers.

In [58]:
M_ = np.concatenate([tfidffinal, pos_, feats_],axis=1)

In [59]:
print(M_.shape)
print(tfidffinal.shape)
print(pos_.shape)
print(feats_.shape)

(10000, 71)
(10000, 62)
(10000, 2)
(10000, 7)


In [60]:
X__ = pd.DataFrame(M_)

In [61]:
y_preds_ = model.predict(X__)

In [62]:
report = classification_report( y, y_preds_ )

In [63]:
print(report)

              precision    recall  f1-score   support

           0       0.57      0.77      0.66      2644
           2       0.91      0.79      0.85      7356

    accuracy                           0.79     10000
   macro avg       0.74      0.78      0.75     10000
weighted avg       0.82      0.79      0.80     10000



OK. So now that we have verified that the results are the same with X_ and X__ we can implement a script that can transform new data in this manner.