In [119]:
from numpy import array
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import punkt

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
# English stop words from NLTK; sent2vec() filters tokens against this list.
stop_words = stopwords.words('english')

In [120]:
def load_MeToo_data(path='MeTooMMD_train.csv'):
    """Read the MeToo annotation table from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file; defaults to 'MeTooMMD_train.csv'.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(path)
    return frame
# Load the annotation table from the default path and report (rows, columns).
MeToo = load_MeToo_data()
print(MeToo.shape)

(7978, 11)


In [121]:
# Tweet comments live in a separate file keyed by 'Id'.
comments = pd.read_csv('Tweet_cmt.csv')

# Keep only the annotated tweets that also have a comment row.
# This filters rows; it does not reorder them to match `comments`.
has_comment = MeToo['TweetId'].isin(comments['Id'])
MeToo = MeToo[has_comment]

# Both tables should now report the same number of rows.
print(MeToo.shape)
print(comments.shape)

(6867, 11)
(6867, 2)


In [122]:
# Names of the annotation columns used as prediction targets.
# Defined once so the column selection and the per-class training loops
# cannot drift apart.
class_names = [
    "Text_Only_Informative", "Image_Only_Informative", "Directed_Hate",
    "Generalized_Hate", "Sarcasm", "Allegation", "Justification", "Refutation",
    "Support", "Oppose",
]

# Target columns of the annotation table, in the order given by class_names.
MeToo_labels = MeToo[class_names]

In [123]:
def preprocess_text(sen):
    """Normalise a raw comment into space-separated alphabetic words.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space;
      2. every stand-alone single letter is dropped;
      3. runs of whitespace collapse to one space and the ends are trimmed.

    Parameters
    ----------
    sen : str
        Raw text of one comment.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    # Remove punctuation and numbers.
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal. Word boundaries are used instead of matching
    # the surrounding whitespace: a pattern that consumes the neighbouring
    # spaces skips every second letter in a run such as "a b c" and never
    # matches a single letter at the very start or end of the text.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

    # Collapse multiple spaces and drop the blanks left at either end.
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    return sentence
# Clean every comment; X is a list of strings in the row order of `comments`.
sentences = list(comments["Comments"])
X = [preprocess_text(sen) for sen in sentences]

# Label matrix as a plain array, in the row order of `MeToo_labels`.
# NOTE(review): X follows `comments` while y follows `MeToo`; nothing here
# aligns the two by tweet id -- confirm both files share the same row order.
y = MeToo_labels.values


In [124]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [125]:
# Per-class targets read from disk; the training loops below index it by class name.
# NOTE(review): presumably one row per training sample, in X_train order --
# confirm how target.csv was produced.
target_output=pd.read_csv('target.csv')

In [127]:
from zeugma.embeddings import EmbeddingTransformer

# Embed the cleaned sentences with zeugma's GloVe transformer.
# The results get their own names: X_train / X_test must keep holding the
# text, because the sent2vec() cell further down tokenises them again.
glove = EmbeddingTransformer('glove')
X_train_zeugma = glove.transform(X_train)
X_test_zeugma = glove.transform(X_test)

# One independent classifier per label column; each one predicts inside the
# loop so that no class is overwritten by the next model.
class_predictions = {}
for class_name in class_names:
    train_target = target_output[class_name]
    model = LogisticRegression(solver='sag')
    model.fit(X_train_zeugma, train_target)
    print('Training LogisticRegression Classifier for {} is complete!!'.format(class_name))
    class_predictions[class_name] = model.predict(X_test_zeugma)

# One row per held-out sample, one column per class, in class_names order.
submission = pd.DataFrame(class_predictions, columns=class_names)
submission.to_csv('submission_LogisticRegression.csv', index=False)


Training LogisticRegression Classifier for Text_Only_Informative is complete!!
Training LogisticRegression Classifier for Image_Only_Informative is complete!!
Training LogisticRegression Classifier for Directed_Hate is complete!!
Training LogisticRegression Classifier for Generalized_Hate is complete!!
Training LogisticRegression Classifier for Sarcasm is complete!!
Training LogisticRegression Classifier for Allegation is complete!!
Training LogisticRegression Classifier for Justification is complete!!
Training LogisticRegression Classifier for Refutation is complete!!
Training LogisticRegression Classifier for Support is complete!!
Training LogisticRegression Classifier for Oppose is complete!!


In [128]:
from tqdm import tqdm

# Location of the pre-trained GloVe text file (glove.6B.300d).
# NOTE(review): machine-specific absolute path -- point it at your local copy.
GLOVE_PATH = '/media/mohsin/New Volume/Downloads/glove.6B/glove.6B.300d.txt'

# Maps each word to its float32 coefficient vector.
embeddings_index = {}

# `with` releases the file handle even if parsing raises part-way through.
with open(GLOVE_PATH, encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # A blank line has no word to index.
            continue
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Best effort: skip lines whose coefficients are not numeric.
            continue

print('Found %s word vectors.' % len(embeddings_index))
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    """Turn a sentence into one unit-length embedding vector.

    The text is lower-cased and tokenised with ``word_tokenize``; stop words
    and non-alphabetic tokens are dropped. The vectors of the remaining words
    found in ``embeddings_index`` are summed and the sum is divided by its
    Euclidean norm.

    Parameters
    ----------
    s : object
        The sentence; it is converted with ``str()`` before tokenising.

    Returns
    -------
    numpy.ndarray
        The normalised sum, or ``np.zeros(300)`` when no token has an
        embedding or when the summed vector has zero length.
    """
    tokens = word_tokenize(str(s).lower())

    # Explicit membership test instead of a bare `except:` so that genuine
    # errors are not silently swallowed.
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w not in stop_words and w.isalpha() and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(300)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid dividing by zero, which would put NaNs in the feature matrix.
        return np.zeros(300)
    return v / norm

# Build one sentence vector per sample and stack them into 2-D arrays.
# X_train / X_test are expected to hold sentence text here, since sent2vec()
# tokenises str(x).
xtrain_glove = np.array([sent2vec(text) for text in tqdm(X_train)])
xtest_glove = np.array([sent2vec(text) for text in tqdm(X_test)])

400000it [00:45, 8708.44it/s]
  1%|▏         | 74/5493 [00:00<08:10, 11.06it/s]

Found 400000 word vectors.


100%|██████████| 5493/5493 [00:06<00:00, 788.39it/s]
100%|██████████| 1374/1374 [00:01<00:00, 809.06it/s]


In [None]:
# Cross-validated score of every per-class model; averaged at the end.
scores = []
# Positive-class probabilities for the held-out rows, keyed by class name.
class_probabilities = {}

for class_name in class_names:
    train_target = target_output[class_name]
    classifier = LogisticRegression(solver='sag')

    # Fill `scores` so the final mean is a real number instead of the NaN
    # produced by averaging an empty list.
    cv_score = np.mean(
        cross_val_score(classifier, xtrain_glove, train_target, cv=3, scoring='roc_auc')
    )
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(xtrain_glove, train_target)
    print('Training LogisticRegression Classifier for {} is complete!!'.format(class_name))

    # Keep this class's probabilities instead of overwriting the result on
    # every iteration.
    # NOTE(review): column 1 is taken as the positive class, which assumes
    # binary 0/1 labels -- confirm.
    class_probabilities[class_name] = classifier.predict_proba(xtest_glove)[:, 1]

# One row per held-out sample, one column per class, in class_names order.
submission = pd.DataFrame(class_probabilities, columns=class_names)

print('Total CV score is {}'.format(np.mean(scores)))

# Writing the file stays disabled, as in the original cell:
#submission.to_csv('submission_LogisticRegression.csv', index=False)


In [133]:
# Positive-class probabilities for the held-out rows, keyed by class name.
class_probabilities = {}

for class_name in class_names:
    train_target = target_output[class_name]
    classifier = LogisticRegression(solver='sag')
    classifier.fit(xtrain_glove, train_target)
    print('Training LogisticRegression Classifier for {} is complete!!'.format(class_name))

    # Predict inside the loop: predicting after it would only use the model
    # trained for the last class.
    # NOTE(review): column 1 is taken as the positive class, which assumes
    # binary 0/1 labels -- confirm.
    class_probabilities[class_name] = classifier.predict_proba(xtest_glove)[:, 1]

# One row per held-out sample, one column per class, in class_names order.
submission = pd.DataFrame(class_probabilities, columns=class_names)

# Writing the file stays disabled, as in the original cell:
#submission.to_csv('submission_LogisticRegression.csv', index=False)

Training LogisticRegression Classifier for Text_Only_Informative is complete!!
Training LogisticRegression Classifier for Image_Only_Informative is complete!!
Training LogisticRegression Classifier for Directed_Hate is complete!!
Training LogisticRegression Classifier for Generalized_Hate is complete!!
Training LogisticRegression Classifier for Sarcasm is complete!!
Training LogisticRegression Classifier for Allegation is complete!!
Training LogisticRegression Classifier for Justification is complete!!
Training LogisticRegression Classifier for Refutation is complete!!
Training LogisticRegression Classifier for Support is complete!!
Training LogisticRegression Classifier for Oppose is complete!!
