In [1]:
import pandas as pd 
import json 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Read the sentence table from the CSV file in the working directory.
# The bare name on the last line renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='intersentence.csv')
df

Unnamed: 0,sentences,label,type
0,Many people live in Ethiopia. The people are f...,anti-stereotype,race
1,Many people live in Ethiopia. The people are v...,stereotype,race
2,Many people live in Ethiopia. Cats have sharp ...,unrelated,race
3,My professor is a hispanic man He came here il...,stereotype,race
4,My professor is a hispanic man He is a legal c...,anti-stereotype,race
...,...,...,...
3157,Saudi Arabian clothing is unique. Elephants ar...,unrelated,race
3158,Saudi Arabian clothing is unique. All men and ...,stereotype,race
3159,The Brahmin are a class of people within Hindu...,anti-stereotype,religion
3160,The Brahmin are a class of people within Hindu...,unrelated,religion


In [3]:
# Count how many rows carry each value of the `label` column.
df['label'].value_counts()

anti-stereotype    1054
stereotype         1054
unrelated          1054
Name: label, dtype: int64

In [3]:
# Normalise case: overwrite the `sentences` column with its lower-cased text.
df['sentences'] = df['sentences'].str.lower()
df

Unnamed: 0,sentences,label,type
0,many people live in ethiopia. the people are f...,anti-stereotype,race
1,many people live in ethiopia. the people are v...,stereotype,race
2,many people live in ethiopia. cats have sharp ...,unrelated,race
3,my professor is a hispanic man he came here il...,stereotype,race
4,my professor is a hispanic man he is a legal c...,anti-stereotype,race
...,...,...,...
3157,saudi arabian clothing is unique. elephants ar...,unrelated,race
3158,saudi arabian clothing is unique. all men and ...,stereotype,race
3159,the brahmin are a class of people within hindu...,anti-stereotype,religion
3160,the brahmin are a class of people within hindu...,unrelated,religion


In [4]:
# Remove every run of characters that is neither a word character nor whitespace
# (i.e. punctuation). `regex=True` is passed explicitly: the pattern is a regular
# expression, and pandas' default for `regex` changed to False in pandas 2.0,
# where the pattern would otherwise be matched literally and nothing would be
# stripped.
df['sentences'] = df['sentences'].str.replace(r'[^\w\s]+', '', regex=True)
df

  df['sentences'] = df['sentences'].str.replace(r'[^\w\s]+', '')


Unnamed: 0,sentences,label,type
0,many people live in ethiopia the people are fa...,anti-stereotype,race
1,many people live in ethiopia the people are ve...,stereotype,race
2,many people live in ethiopia cats have sharp c...,unrelated,race
3,my professor is a hispanic man he came here il...,stereotype,race
4,my professor is a hispanic man he is a legal c...,anti-stereotype,race
...,...,...,...
3157,saudi arabian clothing is unique elephants are...,unrelated,race
3158,saudi arabian clothing is unique all men and w...,stereotype,race
3159,the brahmin are a class of people within hindu...,anti-stereotype,religion
3160,the brahmin are a class of people within hindu...,unrelated,religion


In [5]:
# English stopword list from NLTK; kept as a list under its original name.
stop = stopwords.words('english')
# Build a set once so each token lookup is a hash probe instead of a scan of
# the whole list.
stop_set = set(stop)
# NOTE(review): punctuation was already stripped from `sentences` in an earlier
# cell, so stopword entries containing an apostrophe presumably can no longer
# match here -- confirm whether that is intended.
# Drop every whitespace-separated token found in the stopword set and re-join
# the survivors with single spaces.
df['sentences'] = df['sentences'].apply(
    lambda words: ' '.join(word.lower() for word in words.split() if word not in stop_set)
)
df

Unnamed: 0,sentences,label,type
0,many people live ethiopia people fat unathletic,anti-stereotype,race
1,many people live ethiopia people thin good dis...,stereotype,race
2,many people live ethiopia cats sharp claws,unrelated,race
3,professor hispanic man came illegally,stereotype,race
4,professor hispanic man legal citizen,anti-stereotype,race
...,...,...,...
3157,saudi arabian clothing unique elephants large ...,unrelated,race
3158,saudi arabian clothing unique men women wear c...,stereotype,race
3159,brahmin class people within hinduism well know...,anti-stereotype,religion
3160,brahmin class people within hinduism three day...,unrelated,religion


In [6]:
# Single shared Porter stemmer instance used by stem_sentences().
stemmer = PorterStemmer()


def stem_sentences(sentence):
    """Stem every token of a sentence.

    The sentence is split on whitespace, each token is passed through
    ``stemmer.stem``, and the stemmed tokens are joined back together
    with single spaces.

    Parameters
    ----------
    sentence : str
        Text to stem.

    Returns
    -------
    str
        The stemmed tokens joined by one space each.
    """
    return ' '.join(map(stemmer.stem, sentence.split()))

In [7]:
# Replace each sentence with its stemmed form (element-wise over the column).
df['sentences'] = df['sentences'].map(stem_sentences)
df

Unnamed: 0,sentences,label,type
0,mani peopl live ethiopia peopl fat unathlet,anti-stereotype,race
1,mani peopl live ethiopia peopl thin good dista...,stereotype,race
2,mani peopl live ethiopia cat sharp claw,unrelated,race
3,professor hispan man came illeg,stereotype,race
4,professor hispan man legal citizen,anti-stereotype,race
...,...,...,...
3157,saudi arabian cloth uniqu eleph larg mammal,unrelated,race
3158,saudi arabian cloth uniqu men women wear cloth,stereotype,race
3159,brahmin class peopl within hinduism well known...,anti-stereotype,religion
3160,brahmin class peopl within hinduism three day ...,unrelated,religion


In [8]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable between runs.
df_train, df_test = train_test_split(df, random_state=42, train_size=0.8)
print("TRAIN size:", df_train.shape[0])
print("TEST size:", df_test.shape[0])

TRAIN size: 2529
TEST size: 633


In [9]:
# One token list per training sentence (whitespace split) -- the corpus handed
# to Word2Vec below.
documents = [sentence.split() for sentence in df_train['sentences']]

In [10]:
# Build a Word2Vec model from the tokenised training sentences.
# Because the corpus is supplied to the constructor, gensim builds the
# vocabulary and runs training as part of this call.
model = gensim.models.Word2Vec(
    sentences=documents,
    vector_size=300,  # dimensionality of each word vector
    window=7,         # context window size
    min_count=10,     # ignore tokens seen fewer than 10 times
    epochs=10,        # passes over the corpus
    workers=10,       # training threads
)

In [11]:
# The Word2Vec constructor was already given `documents`, so the vocabulary has
# been built and the model trained at construction time. Calling
# build_vocab(documents) a second time would rescan the corpus and
# re-initialise model weights for a fresh training run that the visible cells
# never perform, so the vocabulary is only inspected here.
print("Word2Vec vocab size:", len(model.wv.key_to_index))

In [12]:
# Fit the Keras tokenizer on the training sentences only, so the word index is
# derived from training data.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['sentences'])

# One more than the number of indexed words; presumably the extra slot accounts
# for index 0, which the Keras Tokenizer does not assign to a word -- confirm
# against the Tokenizer docs.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

Total words 3164


In [13]:
# Convert each sentence to its sequence of word indices using the fitted
# tokenizer, then pad/truncate every sequence to a fixed length of 300.
train_sequences = tokenizer.texts_to_sequences(df_train['sentences'])
test_sequences = tokenizer.texts_to_sequences(df_test['sentences'])

x_train = pad_sequences(train_sequences, maxlen=300)
x_test = pad_sequences(test_sequences, maxlen=300)