In [66]:
# utils and general stuff
import re

import numpy as np
import pandas as pd

#Packages to create DFM
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#Models to train
from sklearn.linear_model import LogisticRegression

#Packages for cross-validation and parameter tuning
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix

# project-local helpers
from exam_utils import timeParser

In [6]:
# Load the full tweet table from a zip-compressed CSV. Later cells read the columns
# tweet_created_at, tweet_id, tweet_full_text and tweet_text_lemma from it.
df = pd.read_csv('lemma_all.csv', compression='zip')

In [7]:
# Parse the raw creation timestamps with the project helper `timeParser`.
# Plain column assignment replaces the column together with its dtype, whereas
# `df.loc[:, col] = ...` writes into the existing column in place on pandas >= 2.0
# and can therefore keep the old (string/object) dtype.
# NOTE(review): assumes timeParser returns datetime-like values so that the date
# comparison in the next cell is meaningful -- confirm against exam_utils.
df['tweet_created_at'] = df['tweet_created_at'].apply(timeParser)

In [8]:
# Keep only tweets created after the election, i.e. rows whose tweet_created_at
# compares greater than '2019-06-05'. The .copy() yields an independent frame so that
# later assignments on df_ae do not raise SettingWithCopyWarning.
df_ae = df.loc[df.tweet_created_at > '2019-06-05'].copy()  

In [9]:
# Cast tweet ids to 64-bit integers.
# - Plain column assignment is used because `df.loc[:, col] = ...` sets values in place
#   on pandas >= 2.0, which would leave the column's original dtype untouched.
# - 'int64' is requested explicitly: the platform `int` can be 32 bits wide, which is
#   too small for tweet ids.
df_ae['tweet_id'] = df_ae['tweet_id'].astype('int64')

In [10]:
# removing all retweets
# - `\b` makes sure only the standalone token "RT" at the start counts, so tweets that
#   merely begin with a word such as "RTX" are kept.
# - `na=False` treats rows with a missing full text as "not a retweet"; without it the
#   mask contains NaN and the `~` negation fails.
df_ae = df_ae.loc[~df_ae.tweet_full_text.str.contains(r'^RT\b', na=False)]

# dropping nans in tweet lemma
df_ae = df_ae.dropna(subset=['tweet_text_lemma'])

In [30]:
def sample_dataset(df, n=100, random_state=42):
    '''Draw a reproducible random sample of rows to be labelled by hand.

    Parameters
    ----------
    df : pandas.DataFrame
        Pool of candidate rows.
    n : int, default 100
        Number of rows to draw. When the pool holds fewer than ``n`` rows, every
        row is returned (in shuffled order) instead of raising.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        A new frame with the sampled rows and a ``label`` column filled with NaN.
        The input frame is left untouched.
    '''
    # DataFrame.sample refuses to draw more rows than exist (without replacement),
    # so cap the request at the pool size.
    n_rows = min(n, len(df))
    sampled = df.sample(n_rows, random_state=random_state)
    # assign() returns a new frame, so no chained-assignment warnings can occur.
    return sampled.assign(label=np.nan)

In [31]:
# Getting the first dataset to label!
# Uses the defaults of sample_dataset (n=100, random_state=42) on the filtered tweets.
label = sample_dataset(df_ae)

In [32]:
# Write the sample to an Excel sheet for manual labelling.
# NOTE(review): re-running this cell overwrites 'label_this.xlsx', including any labels
# already entered in that file.
label.to_excel('label_this.xlsx')

# Active learning loop

## Importing data and splitting into X and y

In [33]:
def split_data(path, test_data=False, test_size=0.33, random_state=42):
    '''Load a labelled Excel sheet and return features and labels.

    Parameters
    ----------
    path : str
        Path to the latest labelled data set; the first column is used as index.
    test_data : bool, default False
        When True, additionally split the data into a train and a test part.
    test_size : float, default 0.33
        Fraction of rows held out for testing (only used when ``test_data``).
    random_state : int, default 42
        Seed for the split so that repeated runs give the same partition.

    Returns
    -------
    tuple
        ``(X, y, new_df)`` when ``test_data`` is False, otherwise
        ``(X_train, X_test, y_train, y_test, new_df)``. ``X`` is the
        ``tweet_text_lemma`` column, ``y`` the ``label`` column and ``new_df``
        the complete frame that was read.
    '''
    new_df = pd.read_excel(path, index_col=0)
    X = new_df.tweet_text_lemma
    y = new_df.label
    # Only split when the caller asked for it: the split is otherwise thrown away
    # and would needlessly fail on very small sheets.
    if not test_data:
        return X, y, new_df
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, new_df

In [25]:
def labelled_unlabelled(new_df, old_df):
    '''Return the rows of old_df whose index label does not occur in new_df's index.'''
    # Boolean array: True where an old row has already been handed out for labelling.
    already_labelled = old_df.index.isin(new_df.index)
    still_open = ~already_labelled
    return old_df.loc[still_open]

In [46]:
# Load the current labelled sheet. With test_data left at False, split_data returns the
# full text column, the full label column and the complete frame (no hold-out set).
X_train, y_train, labelled_df = split_data('label5.xlsx')

In [47]:
# Remaining pool: rows of df_ae whose index is not present in the labelled sheet.
unlabelled_df = labelled_unlabelled(labelled_df, df_ae)

In [48]:
# pipeline to train on
tokenizer = TweetTokenizer()

# Steps:
#   cv     - bag of unigrams and bigrams, tokenised with nltk's TweetTokenizer; terms are
#            kept only if their document frequency lies between min_df=0.01 and max_df=0.999
#   tfidf  - use_idf=False, so counts are only normalised (no inverse-document-frequency weighting)
#   logreg - logistic regression with scikit-learn's default settings
pipeline = Pipeline([ 
    ('cv', CountVectorizer(tokenizer=tokenizer.tokenize, ngram_range = (1, 2), max_df=0.999, min_df=0.01)),
    ('tfidf', TfidfTransformer(use_idf = False)),
    ('logreg', LogisticRegression())
])

In [49]:
# Fit vectoriser, normaliser and classifier on the labelled tweets loaded above.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('cv',
                 CountVectorizer(max_df=0.999, min_df=0.01, ngram_range=(1, 2),
                                 tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fefe80bf0d0>>)),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('logreg', LogisticRegression())])

In [50]:
def predict_unlabelled(pipeline, unlabelled_df):
    '''Score the unlabelled pool with the model's confidence in its own prediction.

    Parameters
    ----------
    pipeline : fitted estimator
        Must expose ``predict_proba``; it is called on the ``tweet_text_lemma`` column.
    unlabelled_df : pandas.DataFrame
        Pool of rows that have not been labelled yet.

    Returns
    -------
    pandas.DataFrame
        A copy of ``unlabelled_df`` with an extra ``max_proba`` column holding, per
        row, the highest predicted class probability. Rows keep their original
        order (no sorting happens here) and the input frame is not modified.
    '''
    temp = unlabelled_df.copy()
    # Nothing to score: skip the model call and return an empty float column.
    if len(temp) == 0:
        temp['max_proba'] = np.array([], dtype=float)
        return temp
    # One row per tweet, one column per class.
    predictions = np.asarray(pipeline.predict_proba(temp.tweet_text_lemma))
    # Vectorised row-wise maximum instead of a Python-level loop over rows.
    temp['max_proba'] = predictions.max(axis=1)
    return temp

In [51]:
# Attach the model's top class probability (max_proba) to every unlabelled tweet.
unlabelled_df = predict_unlabelled(pipeline, unlabelled_df)


In [52]:
# Held-out accuracy.
# X_test / y_test were never created by any cell in this notebook, and `pipeline` itself
# was fitted on every labelled row, so scoring it on a subset of those rows would be
# optimistic. Instead: split the labelled sheet, fit an unfitted copy of the pipeline on
# the training part only and score that copy on the held-out part. `pipeline` itself is
# left untouched for the active-learning step.
X_fit, X_test, y_fit, y_test, _labelled_eval = split_data('label5.xlsx', test_data=True)
clone(pipeline).fit(X_fit, y_fit).score(X_test, y_test)

0.7636363636363637

In [59]:
def label_new_set(unlabelled_df, labelled_df, new_name, n=100):
    '''Write the next batch to label, together with everything labelled so far.

    Parameters
    ----------
    unlabelled_df : pandas.DataFrame
        Unlabelled pool carrying a ``max_proba`` column.
    labelled_df : pandas.DataFrame
        Rows that already have a label; appended below the new batch.
    new_name : str
        File name without extension; the sheet is written to ``{new_name}.xlsx``.
    n : int, default 100
        Number of least-confident rows (lowest ``max_proba``) to hand out.

    Returns
    -------
    None
        The result is only written to disk. ``unlabelled_df`` is not modified.
    '''
    # Sort a copy rather than in place so the caller's frame keeps its row order.
    least_confident = unlabelled_df.sort_values(by='max_proba').head(n)
    # New batch starts with empty labels; assign() returns a fresh frame.
    least_confident = least_confident.assign(label=np.nan)
    new_df = pd.concat([least_confident, labelled_df])
    new_df.to_excel(f'{new_name}.xlsx')
    return None

In [60]:
# Write the next sheet to label ('label6.xlsx'): the rows with the lowest max_proba
# followed by all rows labelled so far.
label_new_set(unlabelled_df, labelled_df, 'label6')

## Checking the current score

In [None]:
#Fill in the parameter values in the grid
# LogisticRegression's default 'lbfgs' solver only supports the l2 penalty, so every
# l1 / elasticnet candidate would fail to fit. The solver is therefore pinned to 'saga',
# which supports all three penalties. 'elasticnet' additionally requires l1_ratio, hence
# its own sub-grid (0.5 = equal mix of l1 and l2). max_iter is raised because saga
# typically needs more iterations than the default to converge.
parameter_grid = [
    {
        'tfidf__use_idf': [False, True],
        'logreg__solver': ['saga'],
        'logreg__max_iter': [1000],
        'logreg__penalty': ['l1', 'l2'],
        'logreg__C': [0.1, 0.5, 1],
    },
    {
        'tfidf__use_idf': [False, True],
        'logreg__solver': ['saga'],
        'logreg__max_iter': [1000],
        'logreg__penalty': ['elasticnet'],
        'logreg__l1_ratio': [0.5],
        'logreg__C': [0.1, 0.5, 1],
    },
]

#Initializing a kfold with 5 folds
cv = StratifiedKFold(n_splits=5)

#Initializing the GridSearchCV
search = GridSearchCV(pipeline, parameter_grid, cv=cv, n_jobs = -1, verbose=10)

In [None]:
# Run the cross-validated grid search on the labelled data.
search.fit(X_train, y_train)

In [53]:
# Score on the same data the pipeline was fitted on, i.e. a training score:
# it is an optimistic estimate and says little about unseen tweets.
pipeline.score(X_train, y_train)

0.778

In [55]:
# Predicted class for every labelled (training) tweet.
y_pred = pipeline.predict(X_train)

In [57]:
# Confusion matrix on the training data: rows are true labels, columns predicted labels.
conf = confusion_matrix(y_train, y_pred)

In [58]:
# Show the raw counts; the diagonal holds the correctly classified tweets per class.
print(conf)

[[209   3   0   3]
 [ 51  75   1   5]
 [ 35   6  36   1]
 [  5   0   1  69]]
