# Merging DataFrames and using Python ML <font color=blue>sklearn</font> for Multiple Linear Regression

# <font color=red>How many males have IAH values > 30?</font>

In [None]:
import pandas as pd

import numpy as np

In [None]:
# Load the clinical table: fields are ';'-separated and the string '-1'
# is read as a missing value (NaN).
OSA_Clinical_df = pd.read_csv(
    "/resources/data/MSTC/OSA_DB_UPM_Clinical.csv",
    sep=';',
    na_values=['-1'],
)

In [None]:
# Keep only complete rows: any row containing at least one NaN is dropped.
OSA_Clinical_df = OSA_Clinical_df.dropna(axis=0, how='any')

In [None]:
# Summary statistics of the clinical table after dropping incomplete rows.
OSA_Clinical_df.describe()

### Reading Demographic Info

In [None]:
# Load the demographic (age / gender) table; the string '-1' is read as NaN.
# NOTE(review): unlike the clinical file, no sep=';' is passed here, so the
# default ',' separator is used — confirm this matches the file.
OSA_Demo_df = pd.read_csv("/resources/data/MSTC/OSA_DB_UPM_Age_Gender.csv", na_values=['-1'])

# Drop incomplete rows of the table that was just loaded. The original cell
# repeated the clean-up on OSA_Clinical_df, leaving OSA_Demo_df untouched.
OSA_Demo_df = OSA_Demo_df.dropna()

# Summary statistics of the demographic table.
OSA_Demo_df.describe()

## DATA AGGREGATION

In [None]:

# Inner join of the clinical and demographic tables on the shared 'ID'
# column: only IDs present in both tables are kept.
# Reference: https://chrisalbon.com/python/pandas_join_merge_dataframe.html

OSA_inner_df = OSA_Clinical_df.merge(OSA_Demo_df, how='inner', on='ID')



In [None]:
# Peek at the first five rows of the inner-joined table.
OSA_inner_df.head(5)

In [None]:
# Summary statistics of the inner-joined table.
OSA_inner_df.describe()

In [None]:
# Count, per Gender group, the rows whose IAH is strictly greater than 30.
Result = (
    OSA_inner_df.loc[OSA_inner_df['IAH'] > 30]
    .groupby('Gender')['IAH']
    .count()
)

print(Result)


In [None]:
# Result is indexed by the Gender labels, so take the first group by
# position explicitly with .iloc; plain Result[0] relies on the deprecated
# integer-position fallback of Series.__getitem__.
# NOTE(review): assumes the male group sorts first among the Gender labels —
# confirm against the printed Result.
print('Number males IAH > 30 = ', Result.iloc[0])

### Other types of join (merge)

In [None]:
# Left join: keep every clinical row, attaching demographic columns where
# the 'ID' values match.
OSA_left = OSA_Clinical_df.merge(
    OSA_Demo_df,
    how='left',
    left_on='ID',
    right_on='ID',
)

In [None]:
# Summary statistics of the left-joined table.
OSA_left.describe()

In [None]:
# Outer join: keep the IDs found in either table.

OSA_outer = OSA_Clinical_df.merge(OSA_Demo_df, how='outer', on='ID')

In [None]:
# Summary statistics of the outer-joined table.
OSA_outer.describe()

In [None]:
# Show the last 20 rows of the outer-joined table.
OSA_outer.tail(20)

In [None]:
# Column dtypes of the inner-joined table.
OSA_inner_df.dtypes

In [None]:
# Column dtypes of the outer-joined table.
OSA_outer.dtypes

In [None]:
# Cast the 'ID' column of the outer-joined table to integer.
OSA_outer['ID'] = OSA_outer['ID'].astype(int)

In [None]:
# Column dtypes of the outer-joined table, to check the 'ID' cast.
OSA_outer.dtypes

In [None]:
# Show the last 20 rows of the outer-joined table again.
OSA_outer.tail(20)

## Let's analyze <font color=brown>OSA_inner_df</font> and use Python ML <font color=blue>sklearn</font> for Multiple Linear Regression

In [None]:
#### Scatter Plots
import matplotlib.pyplot as plt
%matplotlib inline

plt.scatter(x=OSA_inner_df['IAH'].values, y=OSA_inner_df['Weight'].values)
plt.show()

In [None]:
# Same scatter plot, reaching the columns through attribute access.
plt.scatter(OSA_inner_df.IAH.values, OSA_inner_df.Weight.values)
plt.show()

In [None]:
# Set the default figure size (width, height) for all subsequent plots.

import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = (14, 12)

In [None]:
# Matrix of pairwise scatter plots for the joined table.
# scatter_matrix is exposed as pandas.plotting.scatter_matrix; the former
# pd.tools.plotting location no longer exists in current pandas releases.
figure = pd.plotting.scatter_matrix(OSA_inner_df, alpha=0.8)

plt.show()

In [None]:
# Use the 'ID' column as the row index (it stops being a regular column).

OSA_inner_df = OSA_inner_df.set_index('ID')

In [None]:
# Inspect the row index of the inner-joined table.
OSA_inner_df.index

In [None]:
# The whole correlation matrix.
# Select the numeric/bool columns explicitly before calling corr(): recent
# pandas versions no longer drop non-numeric columns silently and raise
# instead (presumably 'Gender' is such a column — it is also excluded from
# the predictors below).
Corr_matrix = OSA_inner_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Peek at the first five rows of the inner-joined table.
OSA_inner_df.head(5)

In [None]:
# Display the correlation matrix.
Corr_matrix

In [None]:
# Plot the correlation matrix as a heatmap.
# seaborn: statistical data visualization
# https://seaborn.pydata.org/

import seaborn as sns

# The mask is all False, so no cell is hidden. The builtin ``bool`` replaces
# the ``np.bool`` alias, which has been removed from NumPy.
sns.heatmap(Corr_matrix,
            mask=np.zeros_like(Corr_matrix, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True)

## Python ML <font color='blue'>sklearn</font> for Multiple Linear Regression

In [None]:

### Picking predictor columns

# The variable we'll be predicting.
target = "IAH"

# Predictors: every column of the dataframe except the target and "Gender".
columns = [c for c in OSA_inner_df.columns if c not in (target, "Gender")]

In [None]:
# Show the selected predictor column names.
columns

In [None]:
## Splitting into train and test sets

# (scikit-learn offers a convenience function for this as well:
#  from sklearn.model_selection import train_test_split)

# Training set: a random 80% sample of the rows; random_state is fixed so
# the split can be replicated.
train = OSA_inner_df.sample(frac=0.8, random_state=1)

# Test set: every row whose index label was not drawn into the training set.
test = OSA_inner_df[~OSA_inner_df.index.isin(train.index)]

# Print the shapes of both sets, one per line.
print(train.shape, test.shape, sep='\n')


In [None]:
# Fitting a linear regression

# Import the linear models.
from sklearn import linear_model

# Ordinary least-squares model.
# (Alternative kept for experimentation: linear_model.Ridge(alpha = 0.5))
model = linear_model.LinearRegression()

# Fit on the training predictors and target; fit() hands back the fitted
# estimator, which is stored under Trained_model.
Trained_model = model.fit(X=train[columns], y=train[target])

In [None]:
### Prediction error on the test set

# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error

# Generate our predictions for the test set.
predictions = model.predict(test[columns])

# Mean squared error between the actual test values and our predictions.
# scikit-learn metrics take the ground truth first and the predictions
# second, matching the r2_score call used elsewhere in this notebook.
mean_squared_error(test[target], predictions)

In [None]:
# The fitted regression coefficients, one per predictor passed to fit()
# (presumably in the same order as `columns`).
print('Coefficients: \n', model.coef_)

In [None]:
# R^2 (coefficient of determination) of the test-set predictions.
from sklearn.metrics import r2_score

print('Variance score: %.2f' % r2_score(y_true=test[target], y_pred=predictions))

In [None]:
# Plot actual test-set target values (x) against the current predictions (y).
plt.scatter(x=test[target], y=predictions, color='black')

plt.show()

In [None]:
### Predicting Training Error

# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error

# Generate our predictions for the training set.
# Note: this overwrites `predictions`, which until now held the test-set
# predictions.
predictions = model.predict(train[columns])

# Mean squared error between the actual training values and our predictions
# (ground truth first, predictions second, as the scikit-learn API expects).
mean_squared_error(train[target], predictions)

In [None]:
# Plot actual training target values (x) against the current predictions (y).
plt.scatter(x=train[target], y=predictions, color='black')

plt.show()

## Try to improve using "feature" engineering ...

In [None]:
# ANOTHER WAY of computing the BMI column (inline lambda instead of a helper function):
# OSA_inner_df['BMI']=OSA_inner_df.apply(lambda row: row['Weight']/np.power(row['Height']/100,2), axis=1)

In [None]:
def BMIcalc(weight, height):
    """Return the body-mass index: weight divided by the squared height.

    ``height`` is divided by 100 before squaring, i.e. it is treated as
    centimetres and converted to metres. ``weight`` is presumably in
    kilograms — TODO confirm the units against the dataset.

    Works on scalars and, element-wise, on NumPy arrays / pandas Series.
    """
    height_in_metres = height / 100
    squared_height = np.power(height_in_metres, 2)
    return weight / squared_height
    

In [None]:
# Add the BMI feature. BMIcalc only uses division and np.power, which work
# element-wise on whole columns, so it is called once on the Weight and
# Height Series instead of once per row through apply(axis=1).
OSA_inner_df['BMI'] = BMIcalc(OSA_inner_df['Weight'], OSA_inner_df['Height'])

In [None]:
# Peek at the first five rows, now including the 'BMI' column.
OSA_inner_df.head(5)

In [None]:
### Picking predictor columns

# The variable we'll be predicting.
target = "IAH"

# Predictors: every column of the dataframe except the target and "Gender".
columns = [c for c in OSA_inner_df.columns if c not in (target, "Gender")]

## Splitting into train and test sets

# (scikit-learn offers a convenience function for this as well:
#  from sklearn.model_selection import train_test_split)

# Training set: a random 80% sample of the rows; random_state is fixed so
# the split can be replicated.
train = OSA_inner_df.sample(frac=0.8, random_state=1)

# Test set: every row whose index label was not drawn into the training set.
test = OSA_inner_df[~OSA_inner_df.index.isin(train.index)]


In [None]:
# Show the selected predictor column names.
columns

In [None]:
# Initialize the model class (ordinary least squares).
# (Alternative kept for experimentation: linear_model.Ridge(alpha = 0.5))
model = linear_model.LinearRegression()

# Fit the model to the training data; fit() returns the fitted estimator.
Trained_model = model.fit(train[columns], train[target])

### Prediction error on the test set

# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error

# Generate our predictions for the test set.
predictions = model.predict(test[columns])

# Mean squared error between the actual test values and our predictions.
# scikit-learn metrics take the ground truth first and the predictions
# second, matching the r2_score call used elsewhere in this notebook.
mean_squared_error(test[target], predictions)

In [None]:
# R^2 (coefficient of determination) of the test-set predictions.
from sklearn.metrics import r2_score

print('Variance score: %.2f' % r2_score(y_true=test[target], y_pred=predictions))

In [None]:
# The fitted regression coefficients, one per predictor passed to fit()
# (presumably in the same order as `columns`).
print('Coefficients: \n', model.coef_)

In [None]:
# Show the predictor column names again, for reading the coefficients.
columns

### Getting statistical info with <font color=blue>statsmodels</font> package

In [None]:
# Install statsmodels through a shell escape.
# NOTE(review): `!pip` may target a different Python environment than the
# running kernel; the `%pip install statsmodels` magic may be safer — confirm.
! pip install statsmodels


In [None]:
import statsmodels.api as sm

# Design matrix: the training predictors plus a constant (intercept) column
# added by add_constant.
X = sm.add_constant(train[columns])

# Ordinary least squares of the target on the design matrix.
est = sm.OLS(endog=train[target], exog=X)
est2 = est.fit()

# Print the statistical summary of the fitted model.
print(est2.summary())