In [34]:
import numpy as np
import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from urllib.parse import urlparse
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

In [18]:
import string

def load_data():
    filename = "../data/twitter_data_fixed.pkl"
    print("Loading data from file: " + filename)
    data = pickle.load(open(filename, 'rb'))
    x_text = []
    labels = []
    ids = []
    for i in range(len(data)):
        text = "".join(l for l in data[i]['text'] if l not in string.punctuation)
        x_text.append((data[i]['text']).encode('utf-8'))
        labels.append(data[i]['label'])
    return x_text,labels

In [19]:
def get_model(name):
    if name == "lr":
        model = LogisticRegression()
    elif name == "svm":
        model = LinearSVC()
    elif name == "rf":
        model = RandomForestClassifier(n_jobs=-1)
    elif name == "nb":
        model = MultinomialNB()
    else:
        return None
    return model

def evaluate_model(modelname, features, labels):
    model = get_model(modelname)
    kfoldcv = KFold(n_splits = n_folds)
    scores = []
    n = 0
    for train_index, test_index in kfoldcv.split(features):
        n += 1
        print(n)
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model.fit(X=X_train, y=y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred, average=None)
        scores.append(f1)

    return np.mean(scores, axis=0)

def is_url(url):
  try:
    result = urlparse(url)
    return all([result.scheme, result.netloc])
  except ValueError:
    return False

In [8]:
model = get_model("svm")
model.fit(X=features, y=labels)
labels_pred = model.predict(features)

NameError: name 'features' is not defined

In [9]:
f1 = f1_score(labels, labels_pred, average=None)

NameError: name 'labels' is not defined

In [10]:
f1

NameError: name 'f1' is not defined

In [20]:
x_text, labels_og = load_data()

Loading data from file: ../data/twitter_data_fixed.pkl


In [21]:
labels, uniques = pd.factorize(labels_og)

In [39]:
comments = pd.DataFrame({'comment': x_text, 'attack': labels})

# decode to UTF-8
comments['comment'] = comments['comment'].str.decode("utf-8")

# remove missing rows
comments['comment'].dropna(inplace=True)

# remove usernames
comments['comment'] = comments['comment'].str.replace('(\@\w+.*?)',"")

# lower case everything
comments['comment'] = comments['comment'].str.lower()

# remove URLs
comments['comment'] = [' '.join(y for y in x.split() if not is_url(y)) for x in comments['comment']]

# remove stop words
comments['comment'] = comments['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

# tokenize
tt = TweetTokenizer()
comments['comment'] = [tt.tokenize(entry) for entry in comments['comment']]

# remove punctuation
comments['comment'] = [list(filter(lambda x: x not in string.punctuation, sentence)) for sentence in comments['comment']]


  comments['comment'] = comments['comment'].str.replace('(\@\w+.*?)',"")


In [43]:
comments.dtypes

comment    object
attack      int64
dtype: object

In [40]:
print(comments)

                                                 comment  attack
0      [rt, another, bloody, instant, restaurant, wee...       0
1      [video, peshmerga, decimating, isis, far, inte...       0
2      [oh, really, instant, restaurants, that's, sho...       0
3      [rt, good, weeks, #isis, new, front, opened, #...       0
4      [rt, don, ’, t, need, femisnsn, men, carry, he...       0
...                                                  ...     ...
16085  [rt, i, want, equal, rights, still, want, seat...       2
16086  [rt, go, ahead, call, sexist, scandalous, wome...       2
16087  [epic, always, kept, plugged, in, plugged, use...       0
16088  [think, daesh, planning, second, battle, trenc...       0
16089  [rt, skin, green, colors, suit, wear, ripped, ...       0

[16090 rows x 2 columns]


In [41]:
textdata = comments['comment']

In [42]:
comments.to_csv("../data/twitter_data_preprocessed2.csv", index=False)

In [44]:
models = ["lr", "svm", "rf", "nb"]
representations = ["character n-grams", "word unigrams"]
n_folds = 10

results = []
for rep in representations:
    if rep == "word unigrams":
        transformer = CountVectorizer(analyzer="word", stop_words="english")
    else:
        transformer = CountVectorizer(analyzer = "char", ngram_range=(1,2))
    count_textdata = transformer.fit_transform(textdata)
    freq_transfomer = TfidfTransformer()
    freq_textdata = freq_transfomer.fit_transform(count_textdata)
    features = freq_textdata

    scores = []
    for modelname in models:
        print(rep, modelname)
        f1_scores = evaluate_model(modelname, features, labels)
        scores.append(f1_scores)

    results.append(pd.Series(scores, index=models))

df = pd.concat(results, axis=0, keys=representations)

AttributeError: 'list' object has no attribute 'lower'

In [61]:
print(uniques)
print(df)

['none' 'racism' 'sexism']
character n-grams  lr     [0.8751335072171373, 0.685327978133294, 0.6180...
                   svm    [0.8782619140294108, 0.6970001698013153, 0.642...
                   rf     [0.8661223589261834, 0.534585081271872, 0.5346...
                   nb     [0.8189128812320003, 0.02636348677077856, 0.13...
word unigrams      lr     [0.8823246105420989, 0.6730464408470065, 0.646...
                   svm    [0.8871198073094947, 0.721519829764152, 0.6932...
                   rf     [0.8871867733014165, 0.7372396761164366, 0.665...
                   nb     [0.8480896742923367, 0.40042297496296236, 0.39...
dtype: object


In [64]:
df.name = "F1_Twitter"
df.to_frame().to_csv("results_twitter.csv", index_label=['processing', 'model', ['none', 'racism', 'sexism']])