# 0. Overview
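- This notebook trains a logistic-regression baseline on the `title_length` and `body_length` features, compares several values of the regularization parameter `C` on the validation split, and saves the resulting test-set predictions.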


In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the pre-built train / validation / test splits from disk.
# Each file is opened in a `with` block so its handle is closed as soon as
# the object has been read, instead of being left open until garbage
# collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../data/processed/train_df.pkl', 'rb') as train_file:
    train_df = pickle.load(train_file)
with open('../data/processed/val_df.pkl', 'rb') as val_file:
    val_df = pickle.load(val_file)
with open('../data/processed/test_df.pkl', 'rb') as test_file:
    test_df = pickle.load(test_file)

In [3]:
# Model inputs are the two length columns; the target is `cleaned_y`.
# The same selection is applied to each of the three splits.
feature_cols = ['title_length', 'body_length']
splits = (train_df, val_df, test_df)

X_train, X_val, X_test = (split[feature_cols] for split in splits)
y_train, y_val, y_test = (split.cleaned_y.values for split in splits)

In [4]:
# Sweep the inverse regularization strength C of a logistic-regression
# baseline, recording train / validation accuracy for every setting.
all_rows = []
best_val_acc = float('-inf')
best_clf = None
for c in tqdm([0.001, 0.01, 0.1, 1, 10, 100, 1000]):
    # Pass the swept value through as C. Without it every iteration fits the
    # same default-C model and the sweep produces seven identical rows.
    candidate = LogisticRegression(C = c, random_state = 0).fit(X_train, y_train)
    train_acc = accuracy_score(y_train, candidate.predict(X_train))
    val_acc = accuracy_score(y_val, candidate.predict(X_val))
    all_rows.append({
        'c': c,
        'train_acc': round(train_acc, 2),
        'val_acc': round(val_acc, 2)
    })
    # Compare on the unrounded score; strict '>' keeps the earliest
    # (smallest C) candidate when several settings tie.
    if val_acc > best_val_acc:
        best_val_acc, best_clf = val_acc, candidate

# Later cells read `clf`. Bind it to the model with the best validation
# accuracy rather than to whichever C value happened to be fitted last.
clf = best_clf

100%|██████████| 7/7 [00:03<00:00,  2.18it/s]


In [5]:
# Show the sweep results as a table (one row per entry in all_rows).
pd.DataFrame(data = all_rows)

Unnamed: 0,c,train_acc,val_acc
0,0.001,0.39,0.39
1,0.01,0.39,0.39
2,0.1,0.39,0.39
3,1.0,0.39,0.39
4,10.0,0.39,0.39
5,100.0,0.39,0.39
6,1000.0,0.39,0.39


In [6]:
# Accuracy on the test split for the classifier currently bound to `clf`.
accuracy_score(y_true = y_test, y_pred = clf.predict(X_test))

0.3862

In [7]:
# Keep the test-split predictions of `clf` so they can be saved below.
test_prediction = clf.predict(X = X_test)

In [8]:
# Persist the test-set predictions. The `with` block ensures the file is
# flushed and closed as soon as the dump finishes, instead of leaving an open
# handle whose buffered data is only written when it is garbage-collected.
with open('../data/baseline_pred.pkl', 'wb') as pred_file:
    pickle.dump(test_prediction, pred_file)