# Reading Data
Reads the files that contain the biased (WikiDetox) and unbiased (50/50 aggressive/non-aggressive) data.

In [23]:
#unbiased data input
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import ndimage as nd
import re

Number_of_entry = 29560 #maximum acceptable 29560
#input data and clean up
# Read the same number of rows from each file so the combined set is split 50/50.
# Integer division keeps nrows an int even if Number_of_entry is odd.
rows_per_class = Number_of_entry // 2
agg_data = pd.read_csv('Aggressive_comments.tsv', sep = '\t', nrows = rows_per_class)
neu_data = pd.read_csv('Neutral_comments.tsv', sep = '\t', nrows = rows_per_class)

agg_comment = agg_data.comment
neu_comment = neu_data.comment
agg_target = agg_data.aggression
neu_target = neu_data.aggression

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Aggressive rows come first, followed by the neutral rows.
comment_data = np.asarray(pd.concat([agg_comment, neu_comment]))
target_unbias = np.asarray(pd.concat([agg_target, neu_target]))
print("unbias data inputed")
print("comment_data & target_unbias")



unbias data inputed
comment_data & target_unbias


In [24]:
#Test unbiased scored data 
#new
# Integer division keeps nrows an int even if Number_of_entry is odd.
agg_data_s = pd.read_csv('Aggressive_scored.tsv', sep = '\t', nrows = Number_of_entry // 2)
neu_data_s = pd.read_csv('Friendlier_scored.tsv', sep = '\t', nrows = Number_of_entry // 2)

agg_comment_s = agg_data_s.comment
neu_comment_s = neu_data_s.comment
agg_target_s = agg_data_s.aggression_score
neu_target_s = neu_data_s.aggression_score

# pd.concat replaces Series.append, which was removed in pandas 2.0.
# Rows from the aggressive file come first, followed by the friendlier file.
trial_comment = np.asarray(pd.concat([agg_comment_s, neu_comment_s]))
trial_score = np.asarray(pd.concat([agg_target_s, neu_target_s]))
print("trial data inputed")
print("trial_comment & trial_score")

trial data inputed
trial_comment & trial_score


# Preprocessing
Removes punctuation, the `newlinetoken` marker, and stop words from the unbiased and biased data, then lemmatizes the remaining words.

In [25]:
#first step of preprocessing
# Raw string so the regex escapes (\[ and \]) are not parsed as invalid Python string escapes.
PUNCTUATION_NO_SPACE = re.compile(r"[.;:!*=<>`_'?¿,\"()\[\]]")
PUNCTUATION_SPACE = re.compile("-")
NEWLINE = re.compile("newlinetoken")
#This is more or less the nltk stop list with negations removed
# NOTE(review): apostrophes are stripped from the comments before the stop-word filter runs,
# so entries that still contain an apostrophe (e.g. "you've") can never match a token.
skip = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", 'im',"youre",
        "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 
        'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 
        "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 
        'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or', 'ive',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 
        'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 
        'out', 'on', 'off', 'over', 'under', 'further', 'then', 'once', 'here', 'there', 'theres','when', 'where',
        'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
        'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 
        'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y','u','ur']
# Set copy of the stop list for constant-time membership tests; `skip` itself stays a list.
skip_lookup = frozenset(skip)

#unbiased
# Each entry of comment_data is replaced in place by the list of tokens that survive cleaning.
# Because the entries change from strings to lists, this loop cannot be re-run on its own output.
for i in range(comment_data.size):
    # Lowercase once up front; none of the later substitutions reintroduce uppercase characters.
    cleaned = PUNCTUATION_NO_SPACE.sub("", comment_data[i].lower())
    cleaned = NEWLINE.sub("", cleaned)
    # Hyphens become spaces so hyphenated words split into separate tokens.
    cleaned = PUNCTUATION_SPACE.sub(" ", cleaned)
    comment_data[i] = [word for word in cleaned.split() if word not in skip_lookup]

In [26]:
#Removing different ending of the same word
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

# Build the lemmatizer once instead of once per comment; it does not depend on the loop index.
lemmatizer = WordNetLemmatizer()
# Each entry of comment_data is a token list at this point; it is replaced in place by a single
# space-joined string of lemmatized tokens.
for i in range(comment_data.size):
    comment_data[i] = ' '.join([lemmatizer.lemmatize(word) for word in comment_data[i]])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Kiara2.0/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
# Binarize the labels in place: values above 0.5 become 1, everything else becomes 0.
target_unbias[:] = np.where(target_unbias > 0.5, 1, 0)

# Model for binary prediction
Uses the TfidfVectorizer with a LogisticRegression classifier (LinearSVC is imported but not used here).

In [28]:
#model for binary predicting
#default TF-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


# Slots for mean accuracies; accuracy_test writes its result at the index it is given.
ngram_arc = np.zeros(5)

comment_data = np.asarray(comment_data)

# Unigram TF-IDF features. fit_transform learns the vocabulary/idf weights and builds the
# document-term matrix in one pass instead of tokenizing the corpus twice.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
comment_vectorized = tfidf_vectorizer.fit_transform(comment_data)
    
def accuracy_test(ind, c=1, trials=50, random_state=None):
    """Estimate accuracy over repeated random 90/10 splits and return the last fitted model.

    Each trial splits the module-level ``comment_vectorized`` / ``target_unbias`` with
    ``train_test_split(test_size=0.1)``, fits a fresh ``LogisticRegression(C=c)`` on the
    training part and scores it on the held-out part. The mean accuracy over all trials is
    stored in ``ngram_arc[ind]`` and printed.

    Parameters
    ----------
    ind : int
        Index into the module-level ``ngram_arc`` array where the mean accuracy is stored.
    c : float, default 1
        Value passed as ``C`` to ``LogisticRegression``.
    trials : int, default 50
        Number of random splits to average over; must be at least 1.
    random_state : int or None, default None
        When None the splits are unseeded, as before. When an int, trial ``i`` uses the
        seed ``random_state + i`` so the run is repeatable while the splits still differ.

    Returns
    -------
    The ``LogisticRegression`` model fitted in the final trial (trained on that trial's
    training split only, not on the full data).

    Raises
    ------
    ValueError
        If ``trials`` is smaller than 1 (the mean would otherwise divide by zero).
    """
    if trials < 1:
        raise ValueError("trials must be at least 1")

    total = 0
    model = None
    for i in range(trials):
        # Offset the seed per trial so repeated trials do not reuse the same split.
        split_seed = None if random_state is None else random_state + i
        data_train, data_test, target_train, target_test = train_test_split(
            comment_vectorized, target_unbias, test_size = 0.1, random_state = split_seed)

        model = LogisticRegression(C=c)
        model.fit(data_train, target_train)
        total += accuracy_score(target_test, model.predict(data_test))
    accuracy = total / trials
    ngram_arc[ind] = accuracy
    print("Unbias data Accuracy: ", accuracy)
    return model

# Run the evaluation with default settings, storing the mean accuracy in slot 0 of ngram_arc.
model_ub = accuracy_test(0)

Unbias data Accuracy:  0.8711163734776727


# User Interaction
Uses the trained LogisticRegression model (`model_ub`) with unigram (n-gram = 1) TF-IDF features to classify user input live

In [30]:
#Predicting user input

user_input = input("Write your comment here: ")

# Clean the comment with the same steps that were applied to the training comments.
cleaned_input = PUNCTUATION_NO_SPACE.sub("", user_input.lower())
cleaned_input = NEWLINE.sub("", cleaned_input)
cleaned_input = PUNCTUATION_SPACE.sub(" ", cleaned_input)
user_tokens = [word for word in cleaned_input.split() if word not in skip]

lemmatizer = WordNetLemmatizer()
# Keep the cleaned comment in its own variable; writing it into comment_data would
# overwrite one of the training comments.
user_comment = ' '.join([lemmatizer.lemmatize(word) for word in user_tokens])

# Reuse the vectorizer that produced the model's training features so the columns line up.
# The comment is wrapped in a one-element list so it is treated as a single document
# rather than one document per word.
user_input_vec = tfidf_vectorizer.transform([user_comment])

predicted_result = model_ub.predict(user_input_vec)

# Exactly one document was scored, so its label is the first (only) prediction.
if predicted_result[0] > 0.5:
    print("---Your input comment is classified as aggressive!---")
else:
    print("---Your input comment is classified as nonaggressive!---")

Write your comment here: insert random test comment here
---Your input comment is classified as nonaggressive!---
