### Import Packages

In [None]:
import re
import string
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

### Loading Training Data and Testing Data

In [None]:
# Read the training set; every column from the third onward is treated as a label.
df_train = pd.read_csv('train.csv')
labels = df_train.columns[2:].tolist()
df_labels = df_train[labels].copy()

print('Training labels:')
print(df_labels.columns)
print(df_labels.shape)  # (rows, number of label columns)

# Keep only the model inputs: remove the label columns and the 'id' column.
print('\nTraining data:')
df_train.drop(columns=labels + ['id'], inplace=True)
print(df_train.columns.tolist())
print(df_train.shape)

# The test set is loaded as-is; no columns are removed here.
df_test = pd.read_csv('test.csv')
print(df_test.columns)
print(df_test.shape)  # (rows, columns) of the test set

### Feature Engineering 

In [None]:
#Reference: https://jayspeidell.github.io/portfolio/project05-toxic-comments/
def caps(s):
    """Return the share of alphabetic characters in *s* that are uppercase.

    Returns 0 when *s* contains no alphabetic characters at all.
    """
    letter_count = sum(1 for ch in s if ch.isalpha())
    if not letter_count:
        return 0
    # Uppercase characters are counted over the whole string, not only the
    # alphabetic ones, exactly as the ratio is defined above.
    upper_count = sum(1 for ch in s if ch.isupper())
    return upper_count / letter_count
def word_length(s):
    """Return the mean length of the purely alphabetic words in *s*.

    Words are obtained by splitting on single spaces; tokens containing any
    non-alphabetic character (digits, punctuation, newlines) are ignored.

    Returns 0 when there is no alphabetic word (including the empty string),
    instead of the NaN that ``np.mean`` yields for an empty list.
    """
    lengths = [len(token) for token in s.split(' ') if token.isalpha()]
    if not lengths:
        return 0
    return np.mean(lengths)
# Dotted-quad IPv4 pattern (each octet 0-255), compiled once.  There are no
# word-boundary anchors, so it can also match inside a longer digit run.
_IP_PATTERN = re.compile(
    r'(([2][5][0-5]\.)|([2][0-4][0-9]\.)|([0-1]?[0-9]?[0-9]\.)){3}'
    r'(([2][5][0-5])|([2][0-4][0-9])|([0-1]?[0-9]?[0-9]))'
)


def strip_ip(s, ip=None):
    """Replace every IPv4-looking address in *s* with a single space.

    Parameters:
        s: the text to clean.
        ip: optional compiled regular expression to use instead of the
            default dotted-quad pattern.

    Returns:
        The cleaned string; *s* unchanged when nothing matches.
    """
    pattern = _IP_PATTERN if ip is None else ip
    return pattern.sub(' ', s)


def extract_features(df_train):
    """Add hand-crafted numeric features to a comments frame, in place.

    Columns added (each min-max scaled to [0, 1] within this frame):
        length       number of characters in the comment
        caps         fraction of alphabetic characters that are uppercase
        exclamation  number of '!' characters
        question     number of '?' characters

    ``comment_text`` is then rewritten with IP addresses replaced by a space.
    The features are computed on the text before that replacement.

    Parameters:
        df_train: DataFrame with a ``comment_text`` column.  It is mutated.

    Returns:
        The same DataFrame object, for call chaining.

    NOTE(review): scaling uses the min/max of whichever frame is passed in, so
    train and test frames are scaled independently — confirm this is intended.
    """
    # Coerce once so missing/non-string comments do not break the helpers.
    comments = df_train['comment_text'].map(str)

    df_train['length'] = comments.map(len)
    df_train['caps'] = comments.map(caps)
    # 'word_length' is currently disabled; re-enable by adding its column here.
    df_train['exclamation'] = comments.map(lambda text: text.count('!'))
    df_train['question'] = comments.map(lambda text: text.count('?'))

    # Min-max normalization, vectorized; columns that were not produced above
    # are skipped rather than raising KeyError.
    for feature in ('length', 'caps', 'word_length', 'exclamation', 'question'):
        if feature not in df_train.columns:
            continue
        column = df_train[feature]
        low = column.min()
        span = column.max() - low
        if span == 0:
            # Constant column: avoid dividing by zero.
            df_train[feature] = 0.0
        else:
            df_train[feature] = (column - low) / span

    df_train['comment_text'] = comments.map(strip_ip)
    return df_train

In [None]:
# Add the engineered feature columns to both frames and show the resulting
# column names (extract_features returns the frame it was given).
df_train = extract_features(df_train)
print(df_train.columns.tolist())
df_test = extract_features(df_test)
print(df_test.columns.tolist())

### Vectorizing text

In [None]:
# Turn the comment text into TF-IDF features (word analyzer, English stop
# words removed, at most 10,000 terms).
start = time.time()
comment_vector = TfidfVectorizer(max_features=10000, analyzer='word', stop_words='english')
# Learn the vocabulary and IDF weights from the training comments only.
training_comments = comment_vector.fit_transform(df_train['comment_text'])
# Reuse the fitted vocabulary for the test comments so both matrices share
# the same columns; refitting here would produce an incompatible feature space.
testing_comments = comment_vector.transform(df_test['comment_text'])
print('Vectorizing took %.2f seconds' % (time.time() - start))
print(training_comments)
print(testing_comments)

### Logistic Regression

In [None]:
# Cross-validated F1 score of a logistic regression, one binary problem per label.
# NOTE(review): `cv` is not defined in the visible part of this file — confirm
# it is assigned before this cell runs.
for label in labels:
    lr = LogisticRegression(random_state=42)
    fold_scores = cross_val_score(
        lr,
        training_comments,
        df_labels[label],
        scoring='f1',
        cv=cv,
    )
    print(f'{label} score: {np.mean(fold_scores):.4f}')

### Multinomial Naive Bayes

In [None]:
# Evaluate a multinomial Naive Bayes classifier (alpha=1.0) on the TF-IDF
# training matrix against the label frame.
# NOTE(review): `multi_cv` is not defined in the visible part of this file —
# presumably a cross-validation helper; confirm it exists before this cell runs.
model = MultinomialNB(alpha=1.0)
_ = multi_cv(model, training_comments, df_labels)

### SVM

In [None]:
# Evaluate a linear support vector classifier on the TF-IDF training matrix
# against the label frame.
# NOTE(review): neither `seed` nor `multi_cv` is defined in the visible part of
# this file (the logistic regression cell hard-codes random_state=42) —
# confirm both are assigned before this cell runs.
model = LinearSVC(random_state=seed)
_ = multi_cv(model, training_comments, df_labels)