In [1]:
import pandas as pd

In [6]:
import nltk
import spacy
nlp = spacy.load("en_core_web_sm")

In [12]:
from sklearn.model_selection import train_test_split

**Prepare data**

In [4]:
df = pd.read_csv("data/data.csv", sep=';')
df

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0
...,...,...
993,From the midnight sun where the hot springs blow,0.0
994,Don't say I'm not your type,0.0
995,And therefore never send to know for whom the...,0.0
996,And I can't stand another day,0.0


In [7]:
def prepare_data(data):
    return data.apply(
      lambda x: ' '.join(
          token.lemma_.lower() for token in nlp(x) if 
          not token.is_stop 
          and not token.is_punct
          and not token.is_digit
          and not token.like_email
          and not token.like_num
          and not token.is_space
          and token.is_ascii
          and token.is_alpha
      )
  )

In [8]:
df['comment'] = prepare_data(df['comment'])

In [9]:
df

Unnamed: 0,comment,isHate
0,know woman sport joke,1.0
1,look like sloth deep syndrome,1.0
2,look like russian speak like indian disgusting...,1.0
3,woman deserve abuse guess,1.0
4,woman make baby cooking dinner,1.0
...,...,...
993,midnight sun hot spring blow,0.0
994,type,0.0
995,send know bell toll,0.0
996,stand day,0.0


In [13]:
train, test = train_test_split(df, test_size=0.2)

In [14]:
train

Unnamed: 0,comment,isHate
864,want people throw skittle wedding,0.000000
950,congratu fuckin lation pitbull owner getting b...,0.000000
837,m glad friend,0.000000
566,people nation end migration,0.166667
567,ok hell terrorist gun ammos,0.166667
...,...,...
437,yo go hell homosexual god like,0.333333
922,god humble question u almighty y u send earth ...,0.000000
810,love roast,0.000000
16,clear european differ nigger shitskin human,1.000000


In [15]:
train.to_csv("data/train.csv", sep=';')
test.to_csv("data/test.csv", sep=';')

**Train model**

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import yaml
import json
import optuna

In [23]:
params = {}
with open('params.yaml') as file:
    params = yaml.load(file, Loader=yaml.FullLoader)

In [25]:
params['count-vectorizer']['min_df']

0.945

In [48]:
train_df = pd.read_csv("data/train.csv", sep=';')
test_df = pd.read_csv("data/test.csv", sep=';')

In [57]:
def objective(trial):
    min_df = trial.suggest_loguniform('min_df', 0.00001, 0.25)
    max_df = trial.suggest_loguniform('max_df', 0.75, 1.0)
    model = Pipeline(
        steps=[
            ('counter', CountVectorizer(min_df=min_df, max_df=max_df)),
            ('reg', LinearRegression())
        ]
    )
    model.fit(train_df['comment'], train_df['isHate'])
    predictions = model.predict(test_df['comment'])
    mse = mean_squared_error(test_df['isHate'], predictions)
    return mse

In [61]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=15)

# Get the best hyperparameters and corresponding metrics
best_params = study.best_params
best_mse = study.best_value

[I 2023-06-10 02:02:41,908] A new study created in memory with name: no-name-0a71be66-aaa9-4eaa-8c95-ad7102e1b652
  min_df = trial.suggest_loguniform('min_df', 0.00001, 0.25)
  max_df = trial.suggest_loguniform('max_df', 0.75, 1.0)
[I 2023-06-10 02:02:41,945] Trial 0 finished with value: 0.1354294743210134 and parameters: {'min_df': 0.01585453197488779, 'max_df': 0.9280149071912048}. Best is trial 0 with value: 0.1354294743210134.
  min_df = trial.suggest_loguniform('min_df', 0.00001, 0.25)
  max_df = trial.suggest_loguniform('max_df', 0.75, 1.0)
[I 2023-06-10 02:02:41,996] Trial 1 finished with value: 0.3743646349247611 and parameters: {'min_df': 0.00044579365685716723, 'max_df': 0.8445671172101714}. Best is trial 0 with value: 0.1354294743210134.
  min_df = trial.suggest_loguniform('min_df', 0.00001, 0.25)
  max_df = trial.suggest_loguniform('max_df', 0.75, 1.0)
[I 2023-06-10 02:02:42,013] Trial 2 finished with value: 0.2933887405669014 and parameters: {'min_df': 0.004983660196059041

[I 2023-06-10 02:02:42,215] Trial 9 finished with value: 11.434777587380701 and parameters: {'min_df': 0.0023297370768997345, 'max_df': 0.9558561293138197}. Best is trial 0 with value: 0.1354294743210134.
  min_df = trial.suggest_loguniform('min_df', 0.00001, 0.25)
  max_df = trial.suggest_loguniform('max_df', 0.75, 1.0)
[I 2023-06-10 02:02:42,227] Trial 10 finished with value: 0.13782279886022078 and parameters: {'min_df': 0.01363682703044258, 'max_df': 0.9052169226955166}. Best is trial 0 with value: 0.1354294743210134.
  min_df = trial.suggest_loguniform('min_df', 0.00001, 0.25)
  max_df = trial.suggest_loguniform('max_df', 0.75, 1.0)
[I 2023-06-10 02:02:42,238] Trial 11 finished with value: 0.13484973543117262 and parameters: {'min_df': 0.010849501093295796, 'max_df': 0.9081204842579499}. Best is trial 11 with value: 0.13484973543117262.
  min_df = trial.suggest_loguniform('min_df', 0.00001, 0.25)
  max_df = trial.suggest_loguniform('max_df', 0.75, 1.0)
[I 2023-06-10 02:02:42,250] 

In [62]:
best_params

{'min_df': 0.01832513553891874, 'max_df': 0.9964486658390322}

In [63]:
best_mse

0.12862070360082226

**Save results**

In [64]:
params['count-vectorizer'] = best_params

with open('params.yaml', 'w') as file:
    documents = yaml.dump(params, file)


In [66]:
metrics = {}
metrics['mse'] = best_mse

with open('metrics.json', 'w') as f:
    json.dump(metrics, f)

In [67]:
import pickle

with open('models/linreg.pkl','wb') as f:
    pickle.dump(model,f)