# INTRO
This notebook contains experiments for:


1.   Proposed model - **Text-to-graph + Aggregate rule**
2.   Baseline models:
  
  2.1 Models - **SGDClassifier, KNeighborsClassifier, GaussianNB**

  2.2 Feature engineering - **word n-gram, Word2Vec, FastText, LIWC**

To run the text-to-graph part, an additional parsing model for amrlib must be downloaded and unarchived - https://github.com/bjascob/amrlib-models/releases/download/parse_xfm_bart_base-v0_1_0/model_parse_xfm_bart_base-v0_1_0.tar.gz

To execute this code faster, the use of a GPU is recommended.



In [None]:
!pip install amrlib
!wget https://github.com/bjascob/amrlib-models/releases/download/parse_xfm_bart_base-v0_1_0/model_parse_xfm_bart_base-v0_1_0.tar.gz
!tar xzf model_parse_xfm_bart_base-v0_1_0.tar.gz
!pip install Unidecode
!pip install graphviz
!pip install emoji
!pip install liwc

In [None]:
import pandas as pd
import numpy as np
import random
import pickle
import amrlib
import re
import nltk
import penman
import unicodedata
from tqdm import tqdm
import string
import emoji
import graphviz
import logging
import gensim
import spacy
import warnings
import liwc

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim import corpora

from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from collections import Counter

# Download the NLTK data packages this notebook relies on.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# English stop-word list; used in the baseline section to filter tokens.
stop_eng = stopwords.words('english')

# Silence every warning and disable all log records up to CRITICAL for the
# whole notebook (this also hides potentially useful diagnostics).
warnings.filterwarnings("ignore")
logging.disable(level=logging.CRITICAL)

# Seed Python's and NumPy's global random generators.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Load the translated dataset, drop the "Tweets" column (presumably the
# untranslated originals -- confirm) and the leftover "Unnamed: 0" column,
# and rename the remaining columns to Text / Label.
df = (
    pd.read_excel("Threat_translated_into_English.xlsx")
    .drop(columns=["Tweets", "Unnamed: 0"])
    .rename(columns={"Translated": "Text", "label": "Label"})
)
df.head()

# Text-to-graph + Aggregate rule

## Data preparation

In this section examples from the dataset are prepared to be used in gSofia:


1.   Examples are divided into sentences
2.   For each sentence an AMR graph is built
3.   All instance entities and related links are removed from the graph. The names of all internal vertices are replaced with their corresponding (names used in) instance entities.
4.   The words in the nodes are lemmatized and lowercased.
5.   All AMR graphs of the tweet are merged into one.



In [None]:
# Sentence-to-graph (AMR parsing) model, loaded from the unpacked archive.
# NOTE(review): the absolute "/content/..." path only exists on a Colab-style
# filesystem; adjust model_dir when running elsewhere.
stog = amrlib.load_stog_model(model_dir="/content/model_parse_xfm_bart_base-v0_1_0",)
# WordNet lemmatizer used by refineGraph to normalise node labels.
lemmatizer = WordNetLemmatizer()

In [None]:
"""
A method that runs through all the sentences
from the dataset and executes the DocToGraph from the paper
"""
def enterPoint(df):
    """Build one merged graph per document of ``df['Text']``.

    Every entry of the ``Text`` column is passed to DocToGraph (with a tqdm
    progress bar); the results are returned as a list in column order.
    """
    return [DocToGraph(document) for document in tqdm(df['Text'])]


In [None]:
"""
Makes all modifications to create the desired AMR graph
"""
def DocToGraph(text):
    """Turn one document into a single merged, normalised list of graph triples.

    The text is split into fragments on ".", each fragment is parsed into an
    AMR graph (sentenceToAMRGraph), rewritten into concept-labelled triples
    (ModifyGraph), lower-cased and lemmatised (refineGraph), and all
    per-fragment triple lists are finally merged without duplicates
    (mergeGraphs).

    Parameters
    ----------
    text : str
        Document text; newlines are treated as spaces.

    Returns
    -------
    list
        Merged list of ``[source, role, target]`` triples (empty when the
        text contains no non-blank fragment).
    """
    sentence_graphs = []
    text = text.replace("\n", " ")
    # NOTE(review): splitting on "." also cuts abbreviations and decimals
    # ("U.S.", "3.5") into separate fragments.
    for s in text.split("."):
        # Skip empty AND whitespace-only fragments (e.g. the " " left behind
        # by a trailing ". "); they carry nothing for the parser.
        if not s.strip():
            continue
        a = sentenceToAMRGraph(s)
        m = ModifyGraph(a)
        sentence_graphs.append(refineGraph(m))
    return mergeGraphs(sentence_graphs)

In [None]:
"""
Translates a single sentence into a standard AMR graph
"""
def sentenceToAMRGraph(s):
    """Parse a single sentence with the module-level ``stog`` model.

    The sentence is submitted as a one-element batch to ``parse_sents`` and
    the first (only) result is returned; ModifyGraph later decodes that
    result with ``penman.decode``.
    """
    return stog.parse_sents([s])[0]

In [None]:
"""
Lemmatisation of words in the graph
Lowercase words
"""
def refineGraph(m):
  """Lower-case and lemmatise both endpoints of every triple, in place.

  ``m`` is a list of mutable ``[source, role, target]`` triples. Positions 0
  and 2 are overwritten; the role at position 1 is left untouched. The same
  list object is returned.
  """
  for triple in m:
    for endpoint in (0, 2):
      triple[endpoint] = lemmatizer.lemmatize(triple[endpoint].lower())
  return m

In [None]:
def mergeGraphs(r):
  """Concatenate several triple lists into one, dropping repeated triples.

  Triples keep the order of their first appearance across the graphs in ``r``.
  """
  merged = []
  for triple in (t for graph in r for t in graph):
    if triple in merged:
      continue
    merged.append(triple)
  return merged

In [None]:
"""
Modifies the graph:
1. Finds all duplicates of a single object
2. Replaces duplicates with the original one
Example, orignal -
:ARG0 (p / person) # 3
                  :ARG1 (p2 / person # 4
                        :mod (h / hypocrite)) # 5
                  :manner (l / live-01 # 6
                        :ARG0 p2)
after modification -
:ARG0 (person) # 3
                  :ARG1 (person2 # 4
                        :mod (hypocrite)) # 5
                  :manner (live-01 # 6
                        :ARG0 person2)

"""
def ModifyGraph(a):
  """Rewrite an AMR graph so that edges connect concepts instead of variables.

  ``a`` is decoded with ``penman.decode``. Every triple containing
  ':instance' (variable, ':instance', concept) feeds a variable -> concept
  table and is then dropped. In each remaining triple the source and the
  target are replaced by their concept; anything without an instance entry
  (for example a constant) is kept unchanged.

  For the fragment
      (p2 / person :mod (h / hypocrite))  and  (l / live-01 :ARG0 p2)
  the result contains ['person', ':mod', 'hypocrite'] and
  ['live-01', ':ARG0', 'person'].

  Note: variables that share a concept (e.g. p / person and p2 / person) map
  to the same label; no numeric suffix is added, so they become one node.

  Returns
  -------
  list
      ``[source, role, target]`` lists, in the order of the decoded triples.
  """
  triples = penman.decode(a).triples
  # A later ':instance' triple for the same variable overrides an earlier one.
  concept_of = {t[0]: t[2] for t in triples if ':instance' in t}
  return [
      [concept_of.get(t[0], t[0]), t[1], concept_of.get(t[2], t[2])]
      for t in triples
      if ':instance' not in t
  ]



In [None]:
def save_unique_id(dict_num_all):
  """Pickle the label -> id vocabulary into 'dict_vocab.txt' and report success."""
  with open('dict_vocab.txt', 'wb') as vocab_file:
    vocab_file.write(pickle.dumps(dict_num_all))
    print('dictionary saved successfully to file')

def save_graph(fin_str, name):
  """Write the serialised graphs ``fin_str`` to the text file ``name`` and report success."""
  with open(name, 'w') as out:
    print(fin_str, end="", file=out)
    print('Graphs saved successfully to file')

"""
all_g - array of all graphs threat or not-trea separately
dict_num_all - dict of unique id for elements in all graphs
i - unique number for all words in all graphs
"""
def prepareForGSofia(all_g, dict_num_all, i):
  """Serialise graphs into the numeric "t / v / e" text format.

  For every graph a header line ``t # <n>`` (n counted from 1) is written,
  then one ``v <local_id> <global_id>`` line per distinct node in order of
  first appearance, then one ``e <src_local> <dst_local> <role_global_id>``
  line per triple.

  Parameters
  ----------
  all_g : list
      Graphs of one class; each graph is a list of
      ``[source, role, target]`` triples.
  dict_num_all : dict
      Label -> global id, shared by node labels and edge roles across all
      calls. Updated in place with every unseen label.
  i : int
      Next free global id.

  Returns
  -------
  tuple
      ``(fin_str, dict_num_all, i)``: the serialised text, the same
      (updated) dictionary and the next free global id.
  """
  # Pieces are collected in a list and joined once; repeated string
  # concatenation and substring searches would be quadratic in output size.
  parts = []

  for graph_num, g in enumerate(all_g, start=1):
    parts.append("t # " + str(graph_num) + "\n")
    dict_num = {}    # node label -> vertex number local to this graph
    edge_lines = []  # emitted after all vertex lines of the graph

    for el in g:
      for node in (el[0], el[2]):
        if node not in dict_num_all:
          dict_num_all[node] = i
          i += 1
        # A node gets its vertex line exactly once: when it first receives
        # a local number.
        if node not in dict_num:
          dict_num[node] = len(dict_num)
          parts.append("v " + str(dict_num[node]) + " " + str(dict_num_all[node]) + "\n")

      if el[1] not in dict_num_all:
        dict_num_all[el[1]] = i
        i += 1

      edge_lines.append(
          "e " + str(dict_num[el[0]]) + " " + str(dict_num[el[2]]) + " " + str(dict_num_all[el[1]]) + "\n")

    parts.extend(edge_lines)

  return "".join(parts), dict_num_all, i


In [None]:
#Create a train:test split
# One stratified shuffle split with 10% of the rows held out for testing.
# X is the whole frame, so the Text and Label columns travel together.
split=StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
X = df
y =  df['Label']
X_train, y_train = None, None
X_test, y_test = None, None
# n_splits=1, so this loop body runs exactly once. Note that the loop
# variable `i` is left behind in the notebook namespace.
for i, (train_index, test_index) in enumerate(split.split( X, y)):
     X_train, y_train = X.iloc[train_index], y.iloc[train_index]
     X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [None]:
# Shared label -> id vocabulary and next free id; both are threaded through
# every prepareForGSofia call below so ids stay consistent across files.
dict_num_all = {}
i = 0

In [None]:
# Threat class (Label == 1): build the merged graphs for the train and test
# rows and write each set to its own file in the numeric graph format.
threat_speech_train = enterPoint(X_train[X_train['Label']==1])
fin_str, dict_num_all, i = prepareForGSofia(threat_speech_train, dict_num_all, i)
save_graph(fin_str, "threat_train_90.txt")

threat_speech_test = enterPoint(X_test[X_test['Label']==1])
fin_str, dict_num_all, i = prepareForGSofia(threat_speech_test, dict_num_all, i)
save_graph(fin_str, "threat_test_90.txt")

In [None]:
# Non-threat class (Label == 0): same procedure, continuing with the
# vocabulary and id counter already extended by the threat graphs.
non_threat_speech_train = enterPoint(X_train[X_train['Label']==0])
fin_str, dict_num_all, i = prepareForGSofia(non_threat_speech_train, dict_num_all, i)
save_graph(fin_str, "non_threat_train_90.txt")

non_threat_speech_test = enterPoint(X_test[X_test['Label']==0])
fin_str, dict_num_all, i = prepareForGSofia(non_threat_speech_test, dict_num_all, i)
save_graph(fin_str, "non_threat_test_90.txt")

In [None]:
# Persist the complete label -> id vocabulary (written to 'dict_vocab.txt').
save_unique_id(dict_num_all)

## Classification

In this section graph patterns from gSofia are used to classify test examples into classes.

In [None]:
"""
Read files produced by gSofia
"""
def read_file(file_name):
    """Parse a "t / v / e / x" graph file into edge lists and support counts.

    The file content is split into records at every ``t # <number>`` header
    line. Inside a record, a line containing "v" declares a vertex
    (``v <local_id> <label>``), a line containing "e" declares an edge
    (``e <src_local> <dst_local> <label>``) and a line containing "x"
    contributes a support count.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.

    Returns
    -------
    all_graphs : list
        One entry per kept record: a list of
        ``[source_label, target_label, edge_label]`` integer triples, with
        local vertex numbers already resolved to their labels.
    sup_patt_count : list of int
        For every "x" line, the number of space-separated tokens that follow
        the first token.

    Records that split into two or fewer lines (for example a record holding
    a single vertex line) are skipped entirely.
    """
    with open(file_name) as file:
        content = file.read()

    # Removing '#' turns "t # 12" into "t  12" (two spaces), which is exactly
    # what the split pattern below matches.
    content = content.replace('#', "")
    records = re.split(r't  [0-9]+\n', content)

    all_graphs = []       # all patterns of one class
    sup_patt_count = []

    for record in records:
        rows = record.split("\n")
        if len(rows) <= 2:
            continue

        cur_edges = []
        cur_num = {}      # local vertex number (str) -> label (str)
        for row in rows:
            # NOTE(review): these are substring tests, so they rely on record
            # lines containing no letters other than the leading tag.
            if "v" in row:
                sym = row.split(' ')
                cur_num[sym[1]] = sym[2]
            if "e" in row:
                sym = row.split(' ')
                # [node1, node2, edge]
                cur_edges.append([int(cur_num[sym[1]]), int(cur_num[sym[2]]), int(sym[3])])
            if "x" in row:
                sup_patt_count.append(len(row.split(" ")[1:]))

        all_graphs.append(cur_edges)

    return all_graphs, sup_patt_count


In [None]:
"""
Check if train pattern subsumes test
"""
def check_if_sumbsume(train, test):
  """Return True when every edge of ``train`` also occurs in ``test``.

  Both arguments are lists of ``[node_a, node_b, edge_label]`` triples. Two
  edges match when their labels are equal and their endpoints are equal,
  ignoring direction. An empty ``train`` is subsumed by anything.

  Each train edge is checked on its own. Counting (train, test) matches
  instead would let one train edge that matches several test edges hide an
  unmatched train edge, or push the count past ``len(train)``. The endpoint
  test also requires both ends to agree, so a self-loop only matches a
  self-loop.
  """
  def _same_edge(tr, t):
    if t[2] != tr[2]:
      return False
    return (t[0] == tr[0] and t[1] == tr[1]) or (t[0] == tr[1] and t[1] == tr[0])

  return all(any(_same_edge(tr, t) for t in test) for tr in train)

"""
Clacluate penalty fo the test example
"""
def get_pen(test, patterns_train_wrong, count_wrong_arr):
  """Penalty of a test example with respect to the opposite class.

  Adds up ``count_wrong_arr[k]`` for every pattern ``patterns_train_wrong[k]``
  that subsumes ``test`` (see check_if_sumbsume). A total of 0 is reported as
  1, so the result can be used directly as a divisor.
  """
  penalty = sum(
      count_wrong_arr[idx]
      for idx, pattern in enumerate(patterns_train_wrong)
      if check_if_sumbsume(pattern, test)
  )
  return 1 if penalty == 0 else penalty

"""
Clacluta both support and penalty of test example for specific class
"""
def sup_pen(test, train, patterns_train_wrong, count_right, count_wrong_arr):
  """Support and penalty of one training pattern for one test example.

  When ``train`` does not subsume ``test`` the pair ``(0, 1)`` is returned.
  Otherwise the support is ``count_right`` and the penalty comes from
  get_pen over the opposite-class patterns.
  """
  if check_if_sumbsume(train, test):
    return count_right, get_pen(test, patterns_train_wrong, count_wrong_arr)
  return 0, 1

In [None]:
def agg_score(test_patters, patterns_train_right, patterns_train_wrong, count_right_arr, count_wrong_arr):
  """Aggregate score of every test graph against one class's patterns.

  For a test graph and the k-th pattern of ``patterns_train_right`` the
  contribution is ``1/L_c * (support * pattern_size) / penalty``, where
  ``L_c`` is the sum of ``count_right_arr``, ``pattern_size`` is the number
  of edges in the pattern, and support / penalty come from sup_pen. The
  aggregate score is the sum of the contributions over all patterns.

  Returns
  -------
  tuple of three lists, aligned with ``test_patters``:
      aggregate scores, the largest single contribution, and the pattern
      that produced it. Contributions start being compared against -1, so
      when nothing matches the first pattern (contribution 0) is reported;
      with no patterns at all the entries are -1 and None.

  A division by zero occurs when ``count_right_arr`` sums to 0 and
  ``test_patters`` is not empty.
  """
  L_c = sum(count_right_arr)
  agg_score_all, max_contr_val_all, max_contr_pat_all = [], [], []

  for test in test_patters:
    running_total = 0
    best_val, best_pat = -1, None

    for idx, train in enumerate(patterns_train_right):
      size_train = len(train)
      support, penalty = sup_pen(test, train, patterns_train_wrong, count_right_arr[idx], count_wrong_arr)
      contribution = 1/L_c * (support*size_train)/penalty

      if contribution > best_val:
        best_val, best_pat = contribution, train

      running_total += (support*size_train)/penalty

    agg_score_all.append((1/L_c) * running_total)
    max_contr_val_all.append(best_val)
    max_contr_pat_all.append(best_pat)

  return agg_score_all, max_contr_val_all, max_contr_pat_all

In [None]:
# Frequent patterns per class together with their support counts.
# NOTE(review): the two .OUT files are not produced by this notebook; they are
# presumably the gSofia outputs for the *_train_90.txt files -- confirm.
t_graphs, t_sup_patt_count = read_file("PATTERNS_THREAT43.OUT")
nt_graphs, nt_sup_patt_count = read_file("PATTERNS_NONTHREAT22.OUT")
# Test graphs: read the files that the data-preparation section actually
# writes ("*_test_90.txt"), so a fresh top-to-bottom run finds them.
t_test_graph,  _ = read_file("threat_test_90.txt")
nt_test_graph,  _ = read_file("non_threat_test_90.txt")

# Label -> id vocabulary written by save_unique_id. pickle must only be used
# on trusted files; this one is created by the notebook itself.
with open('dict_vocab.txt', 'rb') as fp:
    dict_id = pickle.load(fp)

In [None]:
# Score each test graph against the patterns of its own (true) class, with
# the other class's patterns acting as the penalty set.
agg_score_threat, max_contr_val_threat, max_contr_pat_threat = agg_score(t_test_graph, t_graphs, nt_graphs, t_sup_patt_count, nt_sup_patt_count)
agg_score_nonthreat, max_contr_val_nonthreat, max_contr_pat_nonthreat = agg_score(nt_test_graph, nt_graphs, t_graphs, nt_sup_patt_count, t_sup_patt_count)

In [None]:
# Same test graphs, scored against the patterns of the opposite class
# (pattern and penalty roles swapped).
agg_score_threat_wrong, max_contr_val_threat_wrong, max_contr_pat_threat_wrong = agg_score(t_test_graph, nt_graphs, t_graphs, nt_sup_patt_count, t_sup_patt_count)
agg_score_nonthreat_wrong, max_contr_val_nonthreat_wrong, max_contr_pat_nonthreat_wrong = agg_score(nt_test_graph, t_graphs, nt_graphs, t_sup_patt_count, nt_sup_patt_count)

In [None]:
# One row per threat test graph: "*_right" columns hold the scores against
# the threat patterns, "*_wrong" those against the non-threat patterns.
df_threat = pd.DataFrame(
    {'Agg_right': agg_score_threat,
     'Contr_right': max_contr_val_threat,
     'Patt_right': max_contr_pat_threat,
     'Agg_wrong': agg_score_threat_wrong,
     'Contr_wrong': max_contr_val_threat_wrong,
     'Patt_wrong': max_contr_pat_threat_wrong,
    })
# Predicted label: 1 only when the threat score is strictly larger; a tie
# is predicted as 0.
df_threat['isToxic'] = np.where(df_threat['Agg_right']> df_threat['Agg_wrong'], 1, 0)
# True class of every row in this frame.
df_threat['cl'] = 1

In [None]:
# One row per non-threat test graph: "*_right" columns hold the scores against
# the non-threat patterns, "*_wrong" those against the threat patterns.
df_nonthreat = pd.DataFrame(
    {'Agg_right': agg_score_nonthreat,
     'Contr_right': max_contr_val_nonthreat,
     'Patt_right': max_contr_pat_nonthreat,
     'Agg_wrong': agg_score_nonthreat_wrong,
     'Contr_wrong': max_contr_val_nonthreat_wrong,
     'Patt_wrong': max_contr_pat_nonthreat_wrong,
    })
# Predicted label: 0 only when the non-threat score is strictly larger; a tie
# is predicted as 1.
df_nonthreat['isToxic'] = np.where(df_nonthreat['Agg_right']> df_nonthreat['Agg_wrong'], 0, 1)
# True class of every row in this frame.
df_nonthreat['cl'] = 0

In [None]:
def save_image(df_row, file_name, comm, cl):
  """Print a scored test example and render its top own-class pattern.

  Parameters
  ----------
  df_row : pandas.Series
      A row of df_threat / df_nonthreat. ``Patt_right`` is drawn and
      ``df_row.name`` is used as the position of the example inside the
      class-``cl`` subset of the global ``X_test``.
  file_name : str
      Passed to ``Digraph.render`` as the file name.
  comm : str
      Comment stored in the Graphviz source.
  cl : int
      Class label used to select the matching rows of ``X_test``.

  Uses the global ``dict_id`` (label -> id) to turn numeric ids back into
  labels.
  """
  dot = graphviz.Digraph(comment=comm)
  patt = df_row['Patt_right']
  print(df_row)
  print(X_test[X_test['Label']==cl].iloc[df_row.name].Text)
  print("------------------------")

  # Invert the vocabulary once instead of scanning it for every lookup.
  # setdefault keeps the first label for an id, as list.index would.
  id_to_label = {}
  for label, ident in dict_id.items():
    id_to_label.setdefault(ident, label)

  for a in patt:
    dot.node(str(a[0]), id_to_label[a[0]])
    dot.node(str(a[1]), id_to_label[a[1]])
    dot.edge(str(a[0]), str(a[1]), constraint='false', label=id_to_label[a[2]])
  # NOTE(review): file_name ends in ".png" but no output format is set on the
  # Digraph -- confirm that render() produces the intended file; view=True
  # also tries to open a viewer.
  dot.render(file_name, view=True)

In [None]:
# Render the two test examples with the largest own-class contribution
# (Contr_right) for each class.
save_image(df_threat.sort_values(by=['Contr_right'], ascending=False).iloc[0], "threat_1.png", "Threat intermediate", 1)
save_image(df_threat.sort_values(by=['Contr_right'], ascending=False).iloc[1], "threat_2.png", "Threat intermediate", 1)
save_image(df_nonthreat.sort_values(by=['Contr_right'], ascending=False).iloc[0], "nonthreat_1.png", "Not Threat intermediate", 0)
save_image(df_nonthreat.sort_values(by=['Contr_right'], ascending=False).iloc[1], "nonthreat_2.png", "Not Threat intermediate", 0)

In [None]:
from sklearn.metrics import classification_report
# Only test examples whose scores for BOTH classes are non-zero are
# evaluated; rows with a zero score for either class are left out of the
# report, so the support shown can be smaller than the test set.
threat_pred_list = df_threat[(df_threat['Agg_right']!=0) & (df_threat['Agg_wrong']!=0)]['isToxic'].tolist()
threat_size = len(threat_pred_list)
non_threat_pred_list = df_nonthreat[(df_nonthreat['Agg_right']!=0) & (df_nonthreat['Agg_wrong']!=0)]['isToxic'].tolist()
# Predictions: threat rows first, then non-threat rows; the true labels
# below are built in the same order.
threat_pred_list.extend(non_threat_pred_list)
true = [1]*threat_size
true.extend([0]*len(non_threat_pred_list))
print(classification_report(true, threat_pred_list, digits=4))

# Baseline

In [None]:
def text_clean(text):
    """Normalise a raw tweet for the baseline models.

    Steps, in order: emojis are replaced by their English names, the text is
    lower-cased, @mentions, #hashtags and http(s) links are removed,
    punctuation becomes spaces, every remaining character that is not a
    space, word character or dot is dropped, tokens containing digits are
    removed, non-ASCII characters are removed, and a few chat abbreviations
    are expanded.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text (may contain repeated spaces).
    """
    text = emoji.demojize(text, language = "en")
    # to lower case
    text = text.lower()
    # remove mentions
    text = re.sub(r"@[A-Za-z0-9]+", "", text)
    # remove hashtags
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)
    # remove links (both http:// and https://)
    text = re.sub(r'https?://\S+', '', text)
    # replace punctuation with spaces
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # drop anything that is not a space, word character or dot
    # (this is the step that removes newlines and tabs)
    text = re.sub(r'[^ \w\.]', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', r'', text)

    # Expand some abbreviations. Whole words only: a plain substring replace
    # would also rewrite these letter sequences inside longer words.
    expansions = {
        'thnx': 'thanks',
        'thx': 'thanks',
        'btw': 'by the way',
        'pls': 'please',
        'plz': 'please',
        'imho': 'in my humble opinion',
    }
    abbreviation_re = r'\b(?:' + '|'.join(re.escape(short) for short in expansions) + r')\b'
    text = re.sub(abbreviation_re, lambda found: expansions[found.group(0)], text)

    return text

In [None]:
# Normalise every text, then drop English stop words token by token.
# Note: this overwrites df['Text'] in place, so re-running the cell cleans
# already-cleaned text.
df['Text'] = df['Text'].apply(text_clean)
df['Text'] = df['Text'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_eng)
)
df.head()

Unnamed: 0,Text,Label
0,mustaches cut use much power possible people u...,1
1,people karachi catch robbers burn alive whenev...,1
2,coward tries disqualify imran khan whether cow...,1
3,ready understand tomorrow kind revenge action ...,1
4,waiting four names mouth khan made country hon...,1


In [None]:
"""
Gnerating 90:10 split for train:test. The "is_unigram" is option for the Dataframes that have extra attributes
"""
def generate_split(df, is_unigram=True, test_size = 0.1):
  """Stratified train/test split of a feature frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a 'Label' column (and a 'Text' column when
      ``is_unigram`` is True).
  is_unigram : bool
      True  -> both 'Text' and 'Label' are dropped from the features (for
               frames whose remaining columns are already numeric features).
      False -> only 'Label' is dropped, so 'Text' stays in the features.
  test_size : float
      Share of rows held out for testing.

  Returns
  -------
  X_train, y_train, X_test, y_test
      Row subsets of the (index-reset) features and labels. The split uses
      ``random_state=42``.
  """
  dropped = ['Text', 'Label'] if is_unigram else ['Label']
  X = df.drop(columns = dropped).reset_index(drop=True)
  y = df['Label'].reset_index(drop=True)

  X_train, y_train, X_test, y_test = None, None, None, None
  splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
  # n_splits=1: the loop body runs once.
  for train_index, test_index in splitter.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  return X_train, y_train, X_test, y_test

In [None]:
def print_res(X_train, y_train, X_test, y_test):
  """Fit the three baseline classifiers and print their test results.

  For SGDClassifier (perceptron loss, constant learning rate 1.0, no
  penalty, seeded with ``seed_val``), KNeighborsClassifier and GaussianNB,
  in that order, this prints the classifier name, the classification report
  (4 digits), the confusion-matrix counts as ``tp tn fp fn`` and a
  separator line. Nothing is returned.
  """
  classifiers = (
      ("SGDClassifier",
       SGDClassifier(loss="perceptron", eta0=1.0, learning_rate="constant", penalty=None, random_state=seed_val)),
      ("KNeighborsClassifier", KNeighborsClassifier()),
      ("GaussianNB", GaussianNB()),
  )

  for title, classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # ravel() of the 2x2 matrix yields tn, fp, fn, tp in this order.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(title)
    print(classification_report(y_test, y_pred, digits=4))
    print(tp, tn, fp, fn)
    print('__________________')

## Word n-gram

In [None]:
def flat_list(unflat_list):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flatted = []
    for sublist in unflat_list:
        flatted.extend(sublist)
    return flatted
"""
Turning DataFrame of Text into list, so we can make unigrams
"""
def to_list(df, attribute):
    """Return the values of column ``attribute`` of ``df`` as a flat Python list."""
    # Selecting with a list keeps a one-column frame, so tolist() yields
    # one single-element list per row; flat_list unwraps them.
    single_column_rows = df[[attribute]].values.tolist()
    return flat_list(single_column_rows)

def generate_n_gram_features(flat_list_transcription, df, ngram_range=(1, 1)):
    """Append word n-gram count features to ``df``.

    Parameters
    ----------
    flat_list_transcription : list of str
        One text per row of ``df``, in row order.
    df : pandas.DataFrame
        Frame the count columns are appended to.
    ngram_range : tuple of int, optional
        ``(min_n, max_n)`` passed to CountVectorizer. The default ``(1, 1)``
        produces unigram counts.

    Returns
    -------
    X : sparse matrix
        The document-term count matrix returned by ``fit_transform``.
    df : pandas.DataFrame
        ``df`` (with a fresh 0..n-1 index) followed by one count column per
        vocabulary entry.

    NOTE(review): the vocabulary is fitted on every text passed in; when that
    is the full dataset, test documents influence the feature set.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(flat_list_transcription)
    count_vect_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    # concat(axis=1) aligns on the index; resetting it guarantees that row k
    # of df is paired with the counts of text k even for a non-default index.
    df = pd.concat([df.reset_index(drop=True), count_vect_df], axis=1)
    return X, df

# Unigram count features for the cleaned texts, appended to a copy of df.
temp, df_features = generate_n_gram_features(to_list(df, 'Text'), df)

In [None]:
# Default is_unigram=True: 'Text' and 'Label' are dropped, leaving only the count columns.
X_train, y_train, X_test, y_test = generate_split(df_features, test_size = 0.1)

In [None]:
# Baseline results on the word n-gram count features.
print_res(X_train, y_train, X_test, y_test)

## Word2Vec

In [None]:
# is_unigram=False keeps the 'Text' column, which the embedding step needs.
X_train, y_train, X_test, y_test = generate_split(df, is_unigram=False, test_size = 0.1)

In [None]:
def vectorize(sentence):
    """Average the Word2Vec vectors of the known words of ``sentence``.

    Uses the global ``w2v_model``. Words missing from its vocabulary are
    ignored; when no word is known, a zero vector of the model's own
    dimensionality is returned.
    """
    known_vectors = [w2v_model.wv[word] for word in sentence.split() if word in w2v_model.wv]
    if not known_vectors:
        # Take the size from the model rather than repeating the training
        # hyper-parameter, so the two can never drift apart.
        return np.zeros(w2v_model.wv.vector_size)
    return np.array(known_vectors).mean(axis=0)

In [None]:
# Tokenised training texts (the test texts are not used for training).
words = X_train['Text'].apply(lambda x: x.split(" "))

# Word2Vec with 100-dimensional vectors, sg=1 and a context window of 1.
w2v_model = gensim.models.Word2Vec(words, vector_size = 100, sg=1, window=1)
# One averaged vector per document (see vectorize).
X_train_features = X_train['Text'].apply(vectorize)
X_test_features = X_test['Text'].apply(vectorize)


# Turn each Series of per-document vectors into a 2-D array (documents x dimensions).
feature = [x for x in X_train_features.transpose()]
X_train_features = np.asarray(feature)

feature = [x for x in X_test_features.transpose()]
X_test_features = np.asarray(feature)

In [None]:
# Baseline results on the averaged Word2Vec features.
print_res(X_train_features, y_train, X_test_features, y_test)

## FastText

In [None]:
# Same split call as for Word2Vec; 'Text' is kept for the embedding step.
X_train, y_train, X_test, y_test = generate_split(df, is_unigram=False, test_size = 0.1)

In [None]:
# Tokenised training texts (split on single spaces).
corpus=[]
for i in X_train['Text'].values:
    corpus.append(str(i).split(" "))
# FastText model with 100-dimensional vectors and a context window of 5,
# trained with 4 worker threads.
model = gensim.models.FastText(corpus, vector_size=100, workers=4,window=5)

def vectorize_fattext(sentence):
    """Average the FastText vectors of the words of ``sentence``.

    Uses the global ``model``. Words for which ``word in model.wv`` is False
    are ignored; when no vector is available, a zero vector of the model's
    own dimensionality is returned.
    """
    known_vectors = [model.wv[word] for word in sentence.split() if word in model.wv]
    if not known_vectors:
        # Take the size from the model rather than repeating the training
        # hyper-parameter, so the two can never drift apart.
        return np.zeros(model.wv.vector_size)
    return np.array(known_vectors).mean(axis=0)

def get_features(X_train, X_test):
  """Build the FastText feature arrays for the train and test frames.

  Each text of the 'Text' column is mapped to its averaged vector
  (vectorize_fattext); the per-document vectors are then stacked into one
  array per frame.

  Returns
  -------
  X_train_features, X_test_features : numpy.ndarray
  """
  train_vectors = X_train['Text'].apply(vectorize_fattext)
  test_vectors = X_test['Text'].apply(vectorize_fattext)

  X_train_features = np.asarray(list(train_vectors.transpose()))
  X_test_features = np.asarray(list(test_vectors.transpose()))
  return X_train_features, X_test_features

In [None]:
# Averaged FastText vectors for the train and test texts.
X_train_features, X_test_features = get_features(X_train, X_test)

In [None]:
# Baseline results on the averaged FastText features.
print_res(X_train_features, y_train, X_test_features, y_test)

## LIWC

In [None]:
# Load the LIWC dictionary file: `parse` yields the categories of a token,
# `category_names` lists the categories (used below as column names).
parse, category_names = liwc.load_token_parser('LIWC2007_English080730.dic')

In [None]:
def tokenize(text):
    """Lazily yield each maximal run of word characters in ``text``, in order."""
    yield from (found.group(0) for found in re.finditer(r'\w+', text, re.UNICODE))

def full_arr(category_names, counts, full_len):
  """Return ``counts[name]`` for every name in ``category_names``, in order.

  ``counts`` is indexed directly, so it must supply a value for every
  category (a Counter does, reporting 0 for missing ones). ``full_len`` is
  accepted but not used.
  """
  return [counts[name] for name in category_names]

In [None]:
# Build one row per document: the text, one count per LIWC category, and the
# label. Rows are collected in a list and turned into a frame once.
res = []
for sen, label in zip(df['Text'], df['Label']):
  tokens = tokenize(sen)
  # How many tokens of the text fall into each LIWC category.
  counts = Counter(category for token in tokens for category in parse(token))

  # `row_values` rather than `help`, which would shadow the builtin.
  row_values = [sen]
  row_values.extend(full_arr(category_names, counts, len(sen.split(" "))))
  row_values.append(label)

  res.append(row_values)

columns_name = ["Text"]
columns_name.extend(category_names)
columns_name.append('Label')

df_new = pd.DataFrame(data = res, columns = columns_name )

In [None]:
# is_unigram=True drops 'Text' and 'Label', leaving only the LIWC category counts.
X_train, y_train, X_test, y_test = generate_split(df_new, is_unigram=True, test_size = 0.1)

In [None]:
# Baseline results on the LIWC category-count features.
print_res(X_train, y_train, X_test, y_test)