<font size="5">__Symptoms prediction__<font>

In this notebook, we use data given by Pslove about their user's periods

# Data mining

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from joblib import dump, load

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [4]:
# Import files
user = pd.read_csv('User.csv', index_col=0)
period = pd.read_csv('Period.csv', index_col=0)
symptoms = pd.read_csv('Symptom.csv', index_col=0)

# Change data types to nice date types
user.cycle_length_initial = pd.to_timedelta(user.cycle_length_initial, unit='D').dt.days
user.period_length_initial = pd.to_timedelta(user.period_length_initial, unit='D').dt.days
user.dob = pd.to_datetime(user.dob, dayfirst=True, errors='coerce')
period.start_date = pd.to_datetime(period.start_date, dayfirst=True, errors='coerce')
period.end_date = pd.to_datetime(period.end_date, dayfirst=True, errors='coerce')
symptoms.date = pd.to_datetime(symptoms.date, dayfirst=True, errors='coerce')

In [8]:
# Calculate period length
period = period.join(user, on='User_id')
period['length'] = (period.end_date-period.start_date).dt.days + 1
period = period.drop(period[period.length<2].index)
period = period.drop(period[period.length>11].index)

# Calculate the rank of the user's period
period['order'] = 0
for user_id in user.index:
    index = period[period.User_id == user_id].sort_values('start_date').index
    period.loc[index, 'order'] = pd.Series(range(len(index)), index=index)

In [13]:
# Calculate days between periods
diff = period.loc[:, ['User_id', 'start_date']].sort_values(['User_id', 'start_date']).diff()
period['days_since_last'] = ((1 - diff.User_id) * diff.start_date).sort_index().dt.days
# If no previous period saved, fix length at 'cycle_length_initial'
index = period.loc[period.days_since_last==0].index
period.loc[index, 'days_since_last'] = period.cycle_length_initial

In [10]:
def last_period(x):
    # Return last first day of period
    user_period = period[period.User_id == x.user_id]
    last_date = user_period[user_period.start_date <= x.date].start_date.max()
    return last_date

def last_period_order(x):
    # Return last period order
    user_period = period[period.User_id == x.user_id]
    last_order = user_period[user_period.start_date <= x.date].order.max()
    return last_order

def last_len_period(x):
    # Return last period length
    liste = period[(period.User_id==x.user_id)&(period.start_date==x.last_period)].length
    if len(liste)==0:
        return 0
    else:
        return liste.iloc[0]

def last_len_cycle(x):
    # Return last cycle length
    liste = period[(period.User_id==x.user_id)&(period.start_date==x.last_period)].days_since_last
    if len(liste)==0:
        return 0
    else:
        return liste.iloc[0]

In [17]:
# Get features about last period
symptoms['last_period'] = symptoms.apply(last_period, axis=1)
symptoms['last_order'] = symptoms.apply(last_period_order, axis=1)
symptoms['last_len_period'] = symptoms.apply(last_len_period, axis=1)
symptoms['last_len_cycle'] = symptoms.apply(last_len_cycle, axis=1)
symptoms = symptoms.join(user, on='user_id')
# Days since period begin
symptoms['period_day'] = (symptoms.date - symptoms.last_period).dt.days

In [19]:
# Calculate mean cycle length
mean_cycle = period[period.days_since_last<50].groupby('User_id').mean()['days_since_last']
symptoms = symptoms.join(mean_cycle.rename('mean_cycle'), on='user_id')
# Calculate mean period length
mean_period = period.groupby('User_id').mean()['length']
symptoms = symptoms.join(mean_period.rename('mean_period'), on='user_id')

In [28]:
# Create features and labels matrices
X = symptoms.loc[:, ['period_day', 'last_len_period', 'last_len_cycle', 'mean_cycle', 'mean_period']]
Y = symptoms.loc[:, ['acne', 'backache', 'bloating', 'cramp', 'diarrhea', 'dizzy', 'headache', 'mood', 'nausea', 'sore']]
#
X.last_len_period = X.last_len_period.fillna(X.mean_period)
X = X.dropna()
Y = Y.loc[X.index]
# Split traint test
X_train, X_test, y_train, y_test = train_test_split(X, Y)


In [775]:

rfr = rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
#scores = cross_val_score(rfr, X, Y, cv=5)



# Machine learning

In [31]:
# Grid search to get the best model using RandomForest
# Specify parameters to sample from
param_grid = {"n_estimators" : [20, 50, 100],
              "max_depth": [5, 10, None],
              "max_features": [2, 3, 4],
              "min_samples_split": [5, 10, 20],
              "bootstrap": [True, False]}
#
rfr = RandomForestRegressor()
# run grid search
grid_search = GridSearchCV(rfr, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X, Y)
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 3,
 'min_samples_split': 10,
 'n_estimators': 100}

In [37]:
# Save model
dump(rfr, 'model1.joblib')

['model1.joblib']

In [38]:
# Get the model (from grid search or load it)
#rfr = grid_search.best_estimator_
rfr = load('model1.joblib')

# Fit and get score
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.10366016267801274