In [1]:
#tokenize the word
##import libraries
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
##tokenize words without punctuation and change to lower characters
def tokenize_wo_puctuation(str):
  tokenizer = RegexpTokenizer(r"\w+")
  tokenized_sent = tokenizer.tokenize(str.lower())
  return tokenized_sent

#removing the stopwords
##import libraries
from nltk.corpus import stopwords
nltk.download('stopwords')
##remove stopwords from tokenized_sent
def removeStopwords(tokenized_sent):
  stop_words = set(stopwords.words("english"))
  filtered_sent=[]
  for w in tokenized_sent:
    if w not in stop_words:
      filtered_sent.append(w)
  return filtered_sent

#lemmatization
##import libraries
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
def Lemma(filtered_sent):
  lem = WordNetLemmatizer()
  lemmatized_sent=[]
  for w in filtered_sent:
    lemmatized_sent.append(lem.lemmatize(w))
  return lemmatized_sent


#function for preprocessing
def preprocessing(str):
  tokenized_sent = tokenize_wo_puctuation(str)
  filtered_sent = removeStopwords(tokenized_sent)
  lemmatized_sent = Lemma(filtered_sent)
  return lemmatized_sent

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
#function for evaluating 
##import modules
from sklearn.model_selection import cross_val_score
##evaluation by f1 score with 5_cross validation and output mean value.
def evaluation(model_name, X_test_data, y_test_data):
  scores = cross_val_score(model_name, X_test_data, y_test_data, scoring="f1_macro")
  return np.mean(scores)

In [0]:
#load the train data
##import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
##load the data
df = pd.read_csv("/train_data.csv")
df["Text Label"] = df["Text Label"].map({'Bullying': 1, 'Non-Bullying': 0}) #convert "Bullying" into 1, "Non-Bullying" into 0
df.head()

##store to each data
#X_train = df["Tweet"].values
#y_train = df["Text Label"].values
X_train,X_test,y_train,y_test = train_test_split(df["Tweet"].values, df["Text Label"].values, test_size=.2)

In [0]:
#bag-of-words
##import libraries
from sklearn.feature_extraction.text import CountVectorizer
##create vector. The # of {Rows=# of document in the whole document collection}, {Columns=# of unique tokens in the whole document collection}
bow_vect = CountVectorizer(lowercase=False, tokenizer=preprocessing)
bow_train = bow_vect.fit_transform(X_train)
bow_test = bow_vect.transform(X_test)

In [0]:
#tf-idf
##import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
##create vector
tfidf_vect = TfidfVectorizer(lowercase=False, tokenizer=preprocessing)
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

In [6]:
#Naive Bayes Classifier with bow
##import library
from sklearn.naive_bayes import MultinomialNB
##train the model
bow_nb = MultinomialNB()
bow_nb.fit(bow_train, y_train)


#Naive Bayes Classifier with tf-idf
##train the model
tfidf_nb = MultinomialNB()
tfidf_nb.fit(tfidf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
#Logistic Regression with bow
##import library
from sklearn.linear_model import LogisticRegression
##train the model
bow_lr = LogisticRegression()
bow_lr.fit(bow_train, y_train)


#Logistic Regression with tf-idf
##import library
from sklearn.linear_model import LogisticRegression
##train the model
tfidf_lr = LogisticRegression()
tfidf_lr.fit(tfidf_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
#bag-of-words with 2-grams
##create vector. The # of {Rows=# of document in the whole document collection}, {Columns=# of unique tokens in the whole document collection}
bow_bi_vect = CountVectorizer(lowercase=False, tokenizer=preprocessing, ngram_range=(2,2))
bow_bi_train = bow_bi_vect.fit_transform(X_train)
bow_bi_test = bow_bi_vect.transform(X_test)


#tf-idf with 2-grams
##create vector
tfidf_bi_vect = TfidfVectorizer(lowercase=False, tokenizer=preprocessing, ngram_range=(2,2))
tfidf_bi_train = tfidf_bi_vect.fit_transform(X_train)
tfidf_bi_test = tfidf_bi_vect.transform(X_test)

In [9]:
#Naive Bayes Classifier with bow of 2-grams
##train the model
bow_bi_nb = MultinomialNB()
bow_bi_nb.fit(bow_bi_train, y_train)


#Naive Bayes Classifier with tf-idf of 2-grams
##train the model
tfidf_bi_nb = MultinomialNB()
tfidf_bi_nb.fit(tfidf_bi_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
#Logistic Regression with bow of 2_grams
##train the model
bow_bi_lr = LogisticRegression()
bow_bi_lr.fit(bow_bi_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
#the function to create the list of tweets
def Tweets_lis(tweets):
  labelized = []
  for i in range(tweets.shape[0]):
    labelized.append(preprocessing(tweets[i]))
  return labelized

X_train_lis = Tweets_lis(X_train)
X_test_lis = Tweets_lis(X_test)

In [0]:
#build the Word2Vec model
from gensim.models import Word2Vec

w2v = Word2Vec(X_train_lis, size=100, window=5, min_count=2, sg=1, hs=0) 

In [0]:
#take mean of all the word vecotrs present in the tweets. The length will be 200.
def word_vector(tokens, size):
  vec = np.zeros(size).reshape((1, size))
  c = 0
  for word in tokens:
    try:
      vec += w2v[word].reshape((1,size))
      c += 1
    except KeyError:
      continue
    if c != 0:
      vec /= c
    return vec

In [0]:
#import module
import numpy as np
##create the new word2vec whose shape is (# of itself, 200).
def createWord2veclis(X_train_lis):
  wordvec_arrays = np.zeros((len(X_train_lis), 100))
  for i in range(len(X_train_lis)):
    wordvec_arrays[i,:] = word_vector(X_train_lis[i], 100)
  wordvec_df = pd.DataFrame(wordvec_arrays)
  return wordvec_df

In [0]:
#the function to remove the function which includes Nan. Also, create new y value for word2vec.
def removeNanRow(X_data, y_data):
  y_arrays = np.zeros((len(y_data), 1))
  for i in range(len(y_data)):
    y_arrays[i,0] = y_data[i]

  y_data = pd.DataFrame(y_arrays)

  Index = X_data[X_data.isnull().any(axis=1)].index
  X_data=X_data.drop(Index)
  y_data=y_data.drop(Index)
  return X_data, y_data

In [16]:
#create word2vec train and test 
wordvec_df_train = createWord2veclis(X_train_lis)
wordvec_df_test = createWord2veclis(X_test_lis)
w2v_x_train, w2v_y_train = removeNanRow(wordvec_df_train, y_train)
w2v_x_test, w2v_y_test = removeNanRow(wordvec_df_test, y_test)

  import sys


In [17]:
#Logistic Regression with Word2Vec
##train the model
w2v_lr = LogisticRegression()
w2v_lr.fit(w2v_x_train, w2v_y_train)
#w2v_lr.score(w2v_x_test, w2v_y_test)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
#Support Vector Machine with bow
##import the module
from sklearn.svm import SVC
##train the model
bow_SVM = SVC()
bow_SVM.fit(bow_train, y_train)


#Support Vector Machine with tfidf
##train the model
tfidf_SVM = SVC()
tfidf_SVM.fit(tfidf_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [19]:
#nerural network with bow
##import the module
from sklearn.neural_network import MLPClassifier
##train the model
bow_mlp = MLPClassifier()
bow_mlp.fit(bow_train, y_train)

#nerural network with tf-idf
tfidf_mlp = MLPClassifier()
tfidf_mlp.fit(tfidf_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [20]:
print("Naive Bayes Classifier with bow(Bag-of-words): ", evaluation(bow_nb, bow_test, y_test))
print("Naive Bayes Classifier with tf-idf(term frequency and inverse document frequency):", evaluation(tfidf_nb, tfidf_test, y_test))
print("Logistic Regression with bow(Bag-of-words): ", evaluation(bow_lr, bow_test, y_test))
print("Logistic Regression with tf-idf(term frequency and inverse document frequency): ", evaluation(tfidf_lr, tfidf_test, y_test))

print("Naive Bayes Classifier with bow(Bag-of-words with bigrams): ", evaluation(bow_bi_nb, bow_bi_test, y_test))
print("Logistic Regression with bow(Bag-of-words with bigrams): ", evaluation(bow_bi_lr, bow_bi_test, y_test))
print("Logistic Regression with word2vec: ", evaluation(w2v_lr, w2v_x_test, w2v_y_test))

print("Support Vector Machine with bow: ", evaluation(bow_SVM, bow_test, y_test))
print("Support Vector Machine with tf-idf ", evaluation(tfidf_SVM, tfidf_test, y_test))

print("Neural Network with bow: ", evaluation(bow_mlp, bow_test, y_test))
print("Neural Network with tf-idf ", evaluation(tfidf_mlp, tfidf_test, y_test))

Naive Bayes Classifier with bow(Bag-of-words):  0.7135005665788313
Naive Bayes Classifier with tf-idf(term frequency and inverse document frequency): 0.6477965688465223
Logistic Regression with bow(Bag-of-words):  0.7252194851639914
Logistic Regression with tf-idf(term frequency and inverse document frequency):  0.6301903404527293
Naive Bayes Classifier with bow(Bag-of-words with bigrams):  0.4610397241647243
Logistic Regression with bow(Bag-of-words with bigrams):  0.4610397241647243
Logistic Regression with word2vec:  0.35471871412169925
Support Vector Machine with bow:  0.6221553460751836
Support Vector Machine with tf-idf  0.567646867096558


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Neural Network with bow:  0.6324933614528879




Neural Network with tf-idf  0.6548685646801149




In [0]:
#Use 3 models, Logistic Regression with bow, SVM with bow, Neural Network with bow.
#This is the final model
def detection_machine(inp):
  InputLis = []
  InputLis.append(inp)
  InputVect = bow_vect.transform(InputLis)

  lr_result = bow_lr.predict(InputVect)
  svm_result = bow_SVM.predict(InputVect)
  nn_result = bow_mlp.predict(InputVect)
  
  sum = lr_result+svm_result+nn_result

  if(sum >= 2):
    print("Bullying")
  else:
    print("Non-Bullying")

In [22]:
#trial
b1 = "Oh, please shut the fuck up Alecia"
b2 = "Fuck Justin bieber. He's a fucking UM excuse yuh? HE IS NOT!! Ur a fuckin idiot! Kay? good"
b3 = "Hey. Why you such a bitch?"
b4 = "Now how the fuck am i suppose to remember that? Weirdo! maybe it was ur moms dick? lOL!"
b5 = "who decided to take a shit on your face?"

n1 = "Who do you think should be the next president of the United States?"
n2 = "Who is the funniest person you know?"
n3 = "would u ever go bungee jumping like justin did?"

print(detection_machine(b1))
print(detection_machine(b2))
print(detection_machine(b3))
print(detection_machine(b4))
print(detection_machine(b5))
print(detection_machine(n1))
print(detection_machine(n2))
print(detection_machine(n3))

Bullying
None
Bullying
None
Bullying
None
Bullying
None
Bullying
None
Non-Bullying
None
Non-Bullying
None
Non-Bullying
None
