In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.utils import resample

In [3]:
df = pd.read_csv('data_lem.csv')

In [4]:
df.head()

Unnamed: 0,lemmatized,target
0,"['hate', 'thug']",1
1,"['really', ""can't"", 'stand', 'thug', 'like']",1
2,"['despise', 'thug']",1
3,"['detest', 'thug']",1
4,"['absolutely', 'loathe', 'thug']",1


In [5]:
import ast ## This module can be used to evaluate literals, eg: transform string-lists back into lists
def extract_genres(x):
    x = ast.literal_eval(x)
    return x

In [6]:
df['lemmatized'] = df['lemmatized'].apply(extract_genres)
df.head()

Unnamed: 0,lemmatized,target
0,"[hate, thug]",1
1,"[really, can't, stand, thug, like]",1
2,"[despise, thug]",1
3,"[detest, thug]",1
4,"[absolutely, loathe, thug]",1


In [7]:
df['lemmatized'] = [' '.join(x) for x in df['lemmatized']]
df.head()

Unnamed: 0,lemmatized,target
0,hate thug,1
1,really can't stand thug like,1
2,despise thug,1
3,detest thug,1
4,absolutely loathe thug,1


In [8]:
X = df['lemmatized']

y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

In [9]:
Xy_train = pd.concat([X_train, y_train], axis=1)
Xy_train

Unnamed: 0,lemmatized,target
1839,nothing common ghetto trahs,1
7092,never support politician treat autoandrophiles...,0
9024,I'd rather cancre anywhere near Aiden,1
5473,banker c n c e r,1
9764,Saying make bit bastard,0
...,...,...
7526,clownfishs beautiful,0
6471,hurt autogynephiles like,1
2454,bings belong zoo,1
9484,obvious troons opposite stupid,0


In [10]:
Xy_train.target.value_counts()

1    5833
0    2150
Name: target, dtype: int64

### Upsampling

In [11]:
majority = Xy_train[Xy_train['target'] == 1]
minority = Xy_train[Xy_train['target'] == 0]

In [12]:
minority_upsampled = resample(minority, replace=True, n_samples=5833, random_state=14)
upsampled = pd.concat([majority, minority_upsampled])
upsampled['target'].value_counts()

1    5833
0    5833
Name: target, dtype: int64

In [13]:
X_train_up = upsampled['lemmatized']
y_train_up = upsampled['target']

### Training Model

In [14]:
# vectorizer = CountVectorizer()
vectorizer = CountVectorizer(analyzer = "word",
                             lowercase=True,
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 100)
X_train_up_vec = vectorizer.fit_transform(X_train_up)
X_test_vec = vectorizer.transform(X_test)

In [15]:
classifier = LogisticRegression()
classifier.fit(X_train_up_vec, y_train_up)

In [16]:
y_pred = classifier.predict(X_train_up_vec)
accuracy = accuracy_score(y_train_up, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7764443682496143


In [17]:
y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7710420841683366


## Function to test max_features

In [18]:
max_list = [200, 250, 300, 350, 400]
train_accuracy = []
test_accuracy = []
for x in max_list:
    vectorizer = CountVectorizer(analyzer = "word",
                                 lowercase=True,
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = x)

    X_train_up_vec = vectorizer.fit_transform(X_train_up)
    X_test_vec = vectorizer.transform(X_test)

    classifier = LogisticRegression()
    classifier.fit(X_train_up_vec, y_train_up)

    y_pred_train = classifier.predict(X_train_up_vec)
    acc = accuracy_score(y_train_up, y_pred_train)
    train_accuracy.append(acc)

    y_pred_test = classifier.predict(X_test_vec)
    acc2 = accuracy_score(y_test, y_pred_test)
    test_accuracy.append(acc2)

data = {'max_features': max_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}

max_df = pd.DataFrame(data)

In [19]:
max_df

Unnamed: 0,max_features,train_accuracy,test_accuracy
0,200,0.849991,0.860721
1,250,0.92791,0.923848
2,300,0.970341,0.964429
3,350,0.986628,0.983968
4,400,0.989799,0.98497


#### Burrowing Down Further

In [20]:
max_list = [300, 305, 310, 315, 320, 330, 340, 350]
train_accuracy = []
test_accuracy = []
for x in max_list:
    vectorizer = CountVectorizer(analyzer = "word",
                                 lowercase=True,
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = x)

    X_train_up_vec = vectorizer.fit_transform(X_train_up)
    X_test_vec = vectorizer.transform(X_test)

    classifier = LogisticRegression()
    classifier.fit(X_train_up_vec, y_train_up)

    y_pred_train = classifier.predict(X_train_up_vec)
    acc = accuracy_score(y_train_up, y_pred_train)
    train_accuracy.append(acc)

    y_pred_test = classifier.predict(X_test_vec)
    acc2 = accuracy_score(y_test, y_pred_test)
    test_accuracy.append(acc2)

data = {'max_features': max_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}

max_df = pd.DataFrame(data)

In [21]:
max_df

Unnamed: 0,max_features,train_accuracy,test_accuracy
0,300,0.970341,0.964429
1,305,0.970341,0.964429
2,310,0.970513,0.963928
3,315,0.974713,0.971944
4,320,0.975399,0.970441
5,330,0.982342,0.976954
6,340,0.986199,0.983968
7,350,0.986628,0.983968


## Testing against original data

##### The Model with 330 Max Features

In [22]:
vectorizer = CountVectorizer(analyzer = "word",
                                 lowercase=True,
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = 330)

X_train_up_vec = vectorizer.fit_transform(X_train_up)
X_test_vec = vectorizer.transform(X_test)

classifier = LogisticRegression()
classifier.fit(X_train_up_vec, y_train_up)

##### The New Data

In [23]:
half_test_df = pd.read_csv('raw_half_test.csv')
heavy_test_df = pd.read_csv('raw_heavy_test.csv')
light_test_df = pd.read_csv('raw_light_test.csv')

#### Processing New Data

In [24]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from string import punctuation
punctuation = list(punctuation)
from nltk.stem import 	WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def data_prep(data):

    # tokenization
    data['case_templ_token'] = data['case_templ'].apply(lambda x: tokenizer.tokenize(x))

    # Removing Stopwords
    def remove_stopwords(tokens):
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
        return filtered_tokens
    data['case_templ_no_stopwords'] = data['case_templ_token'].apply(remove_stopwords)

    # Removing Punctuation
    def remove_punctuation(tokens):
        clean_tokens = [token for token in tokens if token not in punctuation]
        return clean_tokens
    data['case_templ_clean'] = data['case_templ_no_stopwords'].apply(remove_punctuation)

    # Lemmatize
    def lemmatize_tokens(tokens):
        lemmatized_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
        return lemmatized_tokens
    data['lemmatized'] = data['case_templ_clean'].apply(lemmatize_tokens)


In [25]:
data_prep(half_test_df)
data_prep(heavy_test_df)
data_prep(light_test_df)

In [26]:
columns_to_drop = ['case_templ_token', 'case_templ_no_stopwords', 'case_templ_clean']
half_test_df.drop(columns=columns_to_drop, inplace=True)
heavy_test_df.drop(columns=columns_to_drop, inplace=True)
light_test_df.drop(columns=columns_to_drop, inplace=True)

#### Combining X and y test so can add new datasets

In [27]:
Xy_test = pd.concat([X_test, y_test], axis=1)
Xy_test

Unnamed: 0,lemmatized,target
3205,nothing worse cultural marxist,1
9479,troons complete 4ssholes,1
4745,Putting coastal elite charge best thing anyone,0
1315,gangbangers burned stake,1
3468,Deathto cultural marxist,1
...,...,...
1149,dindus wonderful being,0
3464,cultural marxist without doubt stupidestpeople...,1
1152,dindus blessing society,0
2714,Let's stand bings getting murdered,0


#### Creating Full New Test Datasets

In [28]:
half_test = half_test_df[['lemmatized', 'target']].copy()
half_test['lemmatized'] = [' '.join(x) for x in half_test['lemmatized']]
half_test_combined = pd.concat([Xy_test, half_test])

In [29]:
heavy_test = heavy_test_df[['lemmatized', 'target']].copy()
heavy_test['lemmatized'] = [' '.join(x) for x in heavy_test['lemmatized']]
heavy_test_combined = pd.concat([Xy_test, heavy_test])

In [30]:
light_test = light_test_df[['lemmatized', 'target']].copy()
light_test['lemmatized'] = [' '.join(x) for x in light_test['lemmatized']]
light_test_combined = pd.concat([Xy_test, light_test])

#### Splitting into X and y

In [31]:
half_test_combined_X = half_test_combined['lemmatized']
half_test_combined_y = half_test_combined['target']
half_test_X = half_test['lemmatized']
half_test_y = half_test['target']
heavy_test_combined_X = heavy_test_combined['lemmatized']
heavy_test_combined_y = heavy_test_combined['target']
heavy_test_X = heavy_test['lemmatized']
heavy_test_y = heavy_test['target']
light_test_combined_X = light_test_combined['lemmatized']
light_test_combined_y = light_test_combined['target']
light_test_X = light_test['lemmatized']
light_test_y = light_test['target']

#### Vectorizing

In [32]:
half_test_combined_X_vec = vectorizer.transform(half_test_combined_X)
half_test_X_vec = vectorizer.transform((half_test_X))
heavy_test_combined_X_vec = vectorizer.transform(heavy_test_combined_X)
heavy_test_X_vec = vectorizer.transform((heavy_test_X))
light_test_combined_X_vec = vectorizer.transform(light_test_combined_X)
light_test_X_vec = vectorizer.transform((light_test_X))

#### Making Predictions

In [33]:
half_test_combined_pred = classifier.predict(half_test_combined_X_vec)
half_test_pred = classifier.predict(half_test_X_vec)
heavy_test_combined_pred = classifier.predict(heavy_test_combined_X_vec)
heavy_test_pred = classifier.predict(heavy_test_X_vec)
light_test_combined_pred = classifier.predict(light_test_combined_X_vec)
light_test_pred = classifier.predict(light_test_X_vec)

#### Accuracy

In [34]:
half_test_combined_acc = accuracy_score(half_test_combined_y, half_test_combined_pred)
half_test_acc = accuracy_score(half_test_y, half_test_pred)
heavy_test_combined_acc = accuracy_score(heavy_test_combined_y, heavy_test_combined_pred)
heavy_test_acc = accuracy_score(heavy_test_y, heavy_test_pred)
light_test_combined_acc = accuracy_score(light_test_combined_y, light_test_combined_pred)
light_test_acc = accuracy_score(light_test_y, light_test_pred)

#### Accuracy Table

In [35]:
Datasets = ['half_test_combined', 'half_test', 'heavy_test_combined', 'heavy_test', 'light_test_combined', 'light_test']
Accuracy = [half_test_combined_acc, half_test_acc, heavy_test_combined_acc, heavy_test_acc, light_test_combined_acc, light_test_acc]

data = {'Datasets': Datasets, 'Accuracy': Accuracy}

Acc_table = pd.DataFrame(data)
Acc_table

Unnamed: 0,Datasets,Accuracy
0,half_test_combined,0.97657
1,half_test,0.9
2,heavy_test_combined,0.97657
3,heavy_test,0.9
4,light_test_combined,0.977069
5,light_test,1.0


#### In depth on 50/50

In [38]:
half_test_results = half_test_df.copy()
half_test_results['probability'] = classifier.predict_proba(half_test_X_vec)[:,1]
half_test_results['prediction'] = classifier.predict(half_test_X_vec)
half_test_results['confidence'] = classifier.decision_function(half_test_X_vec)
half_test_results.drop(columns='lemmatized', inplace=True)
half_test_results

Unnamed: 0,case_templ,target,probability,prediction,confidence
0,obama is another deadbeat dad,1,0.896984,1,2.164149
1,another example of the dangerous impact of the...,1,0.788981,1,1.318793
2,our streets are now overrun with dindus,1,0.889937,1,2.090098
3,the demographic of my small town is now what y...,1,0.106196,0,-2.130198
4,this fits right into the cosmopolitan agenda,1,0.96524,1,3.323899
5,politicians are right scumbags,0,0.011448,0,-4.458452
6,"ah, a cup of tea, the true great british pasttime",0,0.002157,0,-6.137067
7,I really hate what the council is doing to our...,0,0.105339,0,-2.13926
8,Modern media like netflix and hulu are so bori...,0,0.266182,0,-1.014078
9,people do not believe me when I say turkey is ...,0,4.1e-05,0,-10.096707


In [37]:
half_test_results.to_csv('LogRegLem.csv')

### More Metrics

In [42]:
from sklearn import metrics

In [40]:
def apr(y_pred, y_real):                                            # function to calculate the accuracy, precision and recall
    """ Calculates accuracy, precision, recall
        Requires predicted value first, and then the real value
    """
    accuracy = metrics.accuracy_score(y_real, y_pred)
    precision = metrics.precision_score(y_real, y_pred)
    recall = metrics.recall_score(y_real, y_pred)
    f1 = metrics.f1_score(y_real, y_pred)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")
    return accuracy, precision, recall, f1

In [43]:
apr(half_test_y, half_test_pred)

Accuracy:0.9
Precision:0.8
Recall:1.0
F1:0.888888888888889


(0.9, 0.8, 1.0, 0.888888888888889)