In [30]:
import pandas as pd
import numpy as np
import os
import pandas_profiling as pp
import streamlit as st
from streamlit_pandas_profiling import st_profile_report
from sklearn.cluster import KMeans
import pickle
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
from sklearn.preprocessing import StandardScaler

%matplotlib inline

### Load ML for Good  Prolific Academic Data 
data_adult_apr_20, data_adult_apr_21, data_adult_may_20, data_adult_nov_20 

data_parent_apr_20, data_parent_apr_21, data_parent_may_20, data_parent_nov_20

In [2]:
path_to_prolific_data = "ML-for-Good-Hackathon\Data\ProlificAcademic"

# Adult data
data_adult_apr_20 = pd.read_csv(path_to_prolific_data+"\April 2020\Data\CRISIS_Adult_April_2020.csv", low_memory=False) 
data_adult_apr_21 = pd.read_csv(path_to_prolific_data+"\April 2021\Data\CRISIS_Adult_April_2021.csv", low_memory=False)
data_adult_may_20 = pd.read_csv(path_to_prolific_data+"\May 2020\Data\CRISIS_Adult_May_2020.csv", low_memory=False)
data_adult_nov_20 = pd.read_csv(r"ML-for-Good-Hackathon\Data\ProlificAcademic\November 2020\Data\CRISIS_Adult_November_2020.csv", low_memory=False)

# Parent data
data_parent_apr_20 = pd.read_csv(path_to_prolific_data+"\April 2020\Data\CRISIS_Parent_April_2020.csv", low_memory=False) 
data_parent_apr_21 = pd.read_csv(path_to_prolific_data+"\April 2021\Data\CRISIS_Parent_April_2021.csv", low_memory=False) 
data_parent_may_20 = pd.read_csv(path_to_prolific_data+"\May 2020\Data\CRISIS_Parent_May_2020.csv", low_memory=False) 
data_parent_nov_20 = pd.read_csv(r"ML-for-Good-Hackathon\Data\ProlificAcademic\November 2020\Data\CRISIS_Parent_November_2020.csv", low_memory=False) 


### Concatenate parent data and display basic stats

In [17]:
all_parent_data = pd.concat([data_parent_apr_20,data_parent_apr_21,data_parent_may_20,data_parent_nov_20], ignore_index=True)


all_parent_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
country,7239.0,186.352535,2.730057,17.0,186.000000,186.0,187.0,187.000000
age,7228.0,43.086884,389.385569,0.0,34.000000,38.0,43.0,33137.000000
sex,2221.0,1.474561,0.505739,1.0,1.000000,1.0,2.0,3.000000
raceethnicity___1,3814.0,0.774777,0.417784,0.0,1.000000,1.0,1.0,1.000000
raceethnicity___2,3814.0,0.027530,0.163644,0.0,0.000000,0.0,0.0,1.000000
...,...,...,...,...,...,...,...,...
substanceproblems2,54.0,0.148148,0.358583,0.0,0.000000,0.0,0.0,1.000000
substancecomplaint2,54.0,0.037037,0.190626,0.0,0.000000,0.0,0.0,1.000000
Nov_cms_ave,775.0,2.169935,0.684103,1.0,1.700000,2.1,2.6,4.800000
Nov_cw_ave,777.0,2.173531,0.673257,1.0,1.666667,2.0,2.5,4.833333


### Concatenate parent data and display basic stats

In [20]:
all_adult_data = pd.concat([data_adult_apr_20,data_adult_apr_21, data_adult_may_20, data_adult_nov_20])


all_adult_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
country,10351.0,186.342189,3.335535,17.000000,186.000000,186.000000,187.000000,187.0
age,10340.0,38.956383,14.746920,10.000000,27.000000,36.000000,51.000000,83.0
sex,2839.0,1.579429,0.501526,1.000000,1.000000,2.000000,2.000000,3.0
raceethnicity___1,5091.0,0.677274,0.467565,0.000000,0.000000,1.000000,1.000000,1.0
raceethnicity___2,5091.0,0.022392,0.147971,0.000000,0.000000,0.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...
smokesoon,190.0,3.910526,0.321035,2.000000,4.000000,4.000000,4.000000,4.0
friendscigarette,190.0,3.800000,0.536499,1.000000,4.000000,4.000000,4.000000,4.0
Nov_cms_ave,849.0,2.639694,0.796700,1.000000,2.100000,2.600000,3.100000,5.0
Nov_cw_ave,858.0,2.713481,0.747382,1.166667,2.166667,2.666667,3.166667,5.0




###  Slice data and get interactive report on basic stats

In [42]:
features = ['age', 'sex', 'country', 'gender', 'raceethnicity___1']
sliced_parent_data = all_parent_data[features]
profile = pp.ProfileReport(sliced_parent_data, title='Pandas Profiling Report')
profile.to_widgets()

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …

### Get HTML report

In [5]:
profile.to_file('myreport.html')




### Explore data with K-Means Clustering

In [43]:
#Combine data related to raceethnicity into 1 column

all_parent_data['raceethnicity___1'] = 1*all_parent_data['raceethnicity___1'] 
all_parent_data['raceethnicity___2'] = 2*all_parent_data['raceethnicity___2'] 
all_parent_data['raceethnicity___3'] = 3*all_parent_data['raceethnicity___3'] 
all_parent_data['raceethnicity___4'] = 4*all_parent_data['raceethnicity___4'] 
all_parent_data['raceethnicity___5'] = 5*all_parent_data['raceethnicity___5'] 

all_parent_data['raceethnicity'] = all_parent_data['raceethnicity___1']+all_parent_data['raceethnicity___2']+all_parent_data['raceethnicity___3']+all_parent_data['raceethnicity___4']+all_parent_data['raceethnicity___5']

In [44]:
features = ['age', 'sex', 'country', 'gender', 'raceethnicity']
sliced_parent_data = all_parent_data[features]

X = StandardScaler().fit_transform(sliced_parent_data)
kmeans = KMeans(n_clusters=4)
model = kmeans.fit(X)


centers = model.cluster_centers_
centers



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').