<a href="https://www.kaggle.com/code/yanngrente/migraine-with-logistic-regression?scriptVersionId=143531542" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Metrics and data preparation
from sklearn.metrics import auc, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

# Classifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the dataset from the Kaggle input directory.
DATA_PATH = '/kaggle/input/migraine-dataset/migraine_data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows.
df.head()

In [None]:
# General description of the data.
df.describe()

## Duplicated and missing values

In [None]:
# Number of duplicated rows.
n_duplicated = df.duplicated().sum()
print(f'There are {n_duplicated} duplicated rows')

In [None]:
# Boolean mask flagging the duplicated rows of df.
dup = df.duplicated()

In [None]:
# What do the duplicated rows look like?
df[dup]

We are looking at structured data, where the same symptoms are signs of the same disease, so identical rows are to be expected. We therefore choose to keep the duplicated rows.

In [None]:
# Missing values.
df.isna().sum()

In [None]:
# Distribution of the values in the 'Type' column.
df['Type'].value_counts()

In [None]:
# Encode the 'Type' values as numbers in a new 'Type_num' column,
# then drop the original column.
encoder = LabelEncoder()
type_codes = encoder.fit_transform(df['Type'])
df = df.assign(Type_num=type_codes).drop(columns='Type')

In [None]:
# Distribution of the encoded values in 'Type_num'.
df['Type_num'].value_counts()

In [None]:
# Correlation matrix of the columns of df (the encoded 'Type_num' column
# included), displayed with the 'coolwarm' colour gradient.
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

## Splitting the data

In [None]:
# 'Type_num' is the target column; every other column is a feature.
target = 'Type_num'
Y = df[target]
X = df.drop(columns=target)

In [None]:
# Splitting the dataset: 30% of the rows are held out as the test set.
split = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = split

# CLASSIFIERS

### Functions

In [None]:
def score(estimator):
    """Print the accuracy of a fitted estimator on the train and test sets.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test`` splits and calls ``estimator.score`` on each of them.

    Parameters
    ----------
    estimator : object
        Fitted estimator exposing ``score(X, y)``.

    Returns
    -------
    None
        The two scores, rounded to 4 decimals, are printed.
    """

    # Built-in round() works whether score() returns a NumPy scalar or a
    # plain Python float; the latter has no .round() method.
    tr_score = round(estimator.score(X_train, Y_train), 4)
    te_score = round(estimator.score(X_test, Y_test), 4)

    print(f'train score:{tr_score} test score:{te_score}')
    
    
def confusion(Y_test, Y_pred):
    """Return the confusion matrix as a labelled DataFrame.

    Parameters
    ----------
    Y_test : array-like
        True class labels.
    Y_pred : array-like
        Predicted class labels, aligned with ``Y_test``.

    Returns
    -------
    pandas.DataFrame
        Rows (``Type_<label>``) are the true classes, columns
        (``pred_Type<label>``) the predicted ones.
    """

    # Name rows/columns after the actual class labels rather than their
    # position in the matrix: when a class is missing from both inputs the
    # matrix shrinks and positions no longer match the encoded labels.
    labels = np.union1d(Y_test, Y_pred)
    mat = confusion_matrix(Y_test, Y_pred, labels=labels)
    return pd.DataFrame(
        mat,
        index=[f'Type_{label}' for label in labels],
        columns=[f'pred_Type{label}' for label in labels],
    )

## Dummy classifier

In [None]:
# Baseline model: a "dummy" classifier using the most-frequent-class
# strategy. The other classifiers are evaluated against this benchmark.
estimator = DummyClassifier(strategy="most_frequent").fit(X_train, Y_train)
estimator

In [None]:
# Predictions on the test set.
Y_pred = estimator.predict(X_test)
pd.Series(Y_pred).value_counts()

In [None]:
# It simply predicts that the next individual will be in the most frequent class.
Y_test.value_counts(normalize=True)

In [None]:
# Accuracy
score(estimator)

In [None]:
# Confusion matrix of the dummy classifier on the test set.
confusion(Y_test, Y_pred)

In [None]:
# Classification report of the dummy classifier on the test set.
print(classification_report(Y_test, Y_pred))

## Logistic Regression

In [None]:
# First model: logistic regression with default hyperparameters.
# `multi_class` is no longer passed explicitly: the parameter is deprecated
# since scikit-learn 1.5, and the default lbfgs solver already fits a
# multinomial model when the target has more than two classes.
estimator = LogisticRegression()
estimator.fit(X_train, Y_train)

In [None]:
# Predictions on the test set.
Y_pred = estimator.predict(X_test)

In [None]:
# Accuracy
score(estimator)

In [None]:
# Confusion matrix on the test set.
confusion(Y_test, Y_pred)

In [None]:
# Classification report on the test set.
print(classification_report(Y_test, Y_pred))

In [None]:
estimator = LogisticRegression()

# Hyperparameter grid, split in two because not every solver supports every
# penalty: 'newton-cg' and 'lbfgs' only handle 'l2', while 'liblinear'
# handles both 'l1' and 'l2'. Listing only valid penalty/solver pairs avoids
# failed fits during the search.
C_values = np.logspace(-3, 3, 7)
params = [
    {'C': C_values,
     "penalty": ['l2'],
     "solver": ["newton-cg", "lbfgs", "liblinear"]},
    {'C': C_values,
     "penalty": ['l1'],
     "solver": ["liblinear"]},
]

In [None]:
# Cross-validated (10 folds) grid search over `params` on the training set.
grid = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=10,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)
grid.fit(X_train, Y_train)

In [None]:
# the best hyperparameters.
best_params = grid.best_params_
best_params

In [None]:
# Fit a logistic regression on the training set with the best
# hyperparameters found by the grid search.
estimator = LogisticRegression(**best_params).fit(X_train, Y_train)
estimator

In [None]:
# Predictions on the test set.
Y_pred = estimator.predict(X_test)

In [None]:
# Accuracy
score(estimator)

In [None]:
# Confusion matrix on the test set.
confusion(Y_test, Y_pred)

In [None]:
# Classification report on the test set.
print(classification_report(Y_test, Y_pred))