## Load file from Cloud Drive
**Reference :**<br>
https://medium.freecodecamp.org/how-to-transfer-large-files-to-google-colab-and-remote-jupyter-notebooks-26ca252892fa

In [0]:
!pip install PyDrive

In [0]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
download = drive.CreateFile({'id': '1lvWgPEUddtJx2ylOuaTkptBG1Pkintzm'})
download.GetContentFile('data_converted.csv')

In [0]:
"""
Recommend！
Mount the whole drive
"""
from google.colab import drive
drive.mount('/content/a')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/a


## Data Preprocessing

In [0]:
# Load dataset
import pandas as pd
import numpy as np
path='a/My Drive/Thermal Comfort/Friends center dataset/data_converted.csv'
df = pd.read_csv(path,skiprows=0)
df.head()

In [0]:
df.shape

(840984, 118)

In [0]:
# Drop data where survey time=null
df1=df.dropna(subset=['Survey Time'])
# print(df1.shape)

X_df=df1.iloc[:,5:14] # select 'Environment' category
y1_df=df1.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y2_df=df1.iloc[:,47] # select 'Thermal Sensation (right now)'
y3_df=df1.iloc[:,50] # select 'Relative Productivity (recently)'
df1_combined=pd.concat([X_df,y1_df,y2_df,y3_df],axis=1)
df1_combined.to_csv('data_v1.csv',index=0)

## Feature Engineering<br>
Feature: Enviroment variables<br>
Label: Thermal comfort, Thermal sensation, Productivity

In [0]:
# Load dataset
import pandas as pd
import numpy as np

filepath='a/My Drive/Thermal Comfort/Friends center dataset/data_v1.csv'
df = pd.read_csv(filepath,skiprows=0)
#df.shape
df.head()

Unnamed: 0,INDOOR Ambient Temp.,INDOOR Relative Humidity,INDOOR Air Velocity,INDOOR Mean Radiant Temp.,INDOOR Lumens,INDOOR CO2,OUTDOOR Ambient Temp.,OUTDOOR Relative Humidity,OUTDOOR Air Velocity,General Thermal Comfort (right now),Thermal Sensation (right now),Relative Productivity (recently)
0,23.721585,61.785,0.030988,23.721585,,602.0,28.0,66.0,4.4704,4.0,0.0,
1,23.980518,58.67,0.030988,23.980518,,736.3,30.0,46.0,4.02336,3.0,-2.5,3.0
2,24.066829,54.12,0.030988,24.066829,,753.4,32.0,38.0,5.81152,5.0,0.0,2.0
3,23.29003,49.107,0.030988,23.29003,,666.1,27.0,51.0,1.34112,4.0,-1.0,
4,23.548963,48.594,0.030988,23.548963,,699.6,28.0,48.0,1.34112,5.0,0.0,3.0


In [0]:
"""One-hot test"""
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
Thermal_Comfort_now=df['General Thermal Comfort (right now)']
Thermal_Comfort_now=Thermal_Comfort_now.fillna(Thermal_Comfort_now.mean())
Thermal_Comfort_now=Thermal_Comfort_now.values.reshape(-1,1)
enc.fit(Thermal_Comfort_now)
enc.transform(Thermal_Comfort_now).toarray()
#Thermal_Comfort_now

In [0]:
"""correlation analysis"""
# data = df[['INDOOR Ambient Temp.','INDOOR Relative Humidity', ['General Thermal Comfort (right now)'],['Thermal Sensation (right now)']]]
df.corr(method='pearson', min_periods=1)


### Corr & P-value for General Thermal Comfort

In [0]:
"""p-value between INDOOR Ambient Temp. & General Thermal Comfort (right now)"""
from scipy import stats

df=df.fillna(df.mean())
In_Ambient_Temp=df['INDOOR Ambient Temp.'].values
Thermal_Comfort_now=df['General Thermal Comfort (right now)'].values
# print (stats.ttest_ind(In_Mean_Radiant_Temp,Thermal_Sensation_now,nan_policy='omit'))

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Ambient_Temp,Thermal_Comfort_now)) 

(-0.053976004887036286, 0.006980006559794964)


In [0]:
"""p-value between INDOOR Relative Humidity & General Thermal Comfort (right now)"""
from scipy import stats

df=df.fillna(df.mean())
In_Hum=df['INDOOR Relative Humidity'].values
Thermal_Comfort_now=df['General Thermal Comfort (right now)'].values
# print (stats.ttest_ind(In_Mean_Radiant_Temp,Thermal_Sensation_now,nan_policy='omit'))

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Hum,Thermal_Comfort_now))

(-0.03281526658521945, 0.10113018789473265)


In [0]:
"""p-value between INDOOR Air Velocity & General Thermal Comfort (right now)"""
from scipy import stats

df=df.fillna(df.mean())
In_Air_Velo=df['INDOOR Air Velocity'].values
Thermal_Comfort_now=df['General Thermal Comfort (right now)'].values
# print (stats.ttest_ind(In_Mean_Radiant_Temp,Thermal_Sensation_now,nan_policy='omit'))

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Air_Velo,Thermal_Comfort_now)) 

(-0.05296531476852682, 0.008115912311827952)


In [0]:
"""p-value between INDOOR Mean Radiant Temp. & General Thermal Comfort (right now)"""
from scipy import stats

df=df.fillna(df.mean())
In_Mean_Radiant=df['INDOOR Mean Radiant Temp.'].values
Thermal_Comfort_now=df['General Thermal Comfort (right now)'].values
# print (stats.ttest_ind(In_Mean_Radiant_Temp,Thermal_Sensation_now,nan_policy='omit'))

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Mean_Radiant,Thermal_Comfort_now))

(-0.03983851238916029, 0.04653329785323916)


### Corr & P-value for Thermal Sensation (right now)

In [0]:
"""p-value between INDOOR Ambient Temp. & Thermal Sensation (right now)"""
from scipy import stats

df=df.fillna(df.mean())
In_Ambient_Temp=df['INDOOR Ambient Temp.'].values
Thermal_Sensation_now=df['Thermal Sensation (right now)'].values
# print (stats.ttest_ind(In_Mean_Radiant_Temp,Thermal_Sensation_now,nan_policy='omit'))

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Ambient_Temp,Thermal_Sensation_now)) 



(0.18948557451236514, 1.2907577641512907e-21)


In [0]:
"""p-value between INDOOR Relative Humidity & Thermal Sensation (right now)"""
from scipy import stats

df=df.fillna(df.mean())
In_Hum=df['INDOOR Relative Humidity'].values
Thermal_Sensation_now=df['Thermal Sensation (right now)'].values
# print (stats.ttest_ind(In_Mean_Radiant_Temp,Thermal_Sensation_now,nan_policy='omit'))

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Hum,Thermal_Sensation_now))

(-0.08535596420315596, 1.9468831482985634e-05)


In [0]:
"""p-value between INDOOR Mean Radiant Temp. & Thermal Sensation (right now)"""
from scipy import stats

# df=df.fillna(df.mean())
In_Mean_Radiant_Temp=df['INDOOR Mean Radiant Temp.'].values
Thermal_Sensation_now=df['Thermal Sensation (right now)'].values

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Mean_Radiant_Temp,Thermal_Sensation_now))

(0.19737649930538578, 2.3728590904967714e-23)


### Corr & P-value for Relative Productivity (recently)

In [0]:
df['Relative Productivity (recently)'].describe()

In [0]:
# Drop data where productivity=null
df_prod=df.dropna(subset=['Relative Productivity (recently)'])
df_prod.shape

(1275, 12)

In [0]:
"""p-value between INDOOR Relative Humidity & Relative Productivity (recently)"""
from scipy import stats

df_prod=df_prod.fillna(df_prod.mean())
In_Humi=df_prod['INDOOR Relative Humidity'].values
Product_now=df_prod['Relative Productivity (recently)'].values

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Humi,Product_now))



(-0.10021689486811504, 0.00033840452289756633)


In [0]:
"""p-value between INDOOR Air Velocity & Relative Productivity (recently)"""
from scipy import stats

df_prod=df_prod.fillna(df_prod.mean())
In_Velo=df_prod['INDOOR Air Velocity'].values
Product_now=df_prod['Relative Productivity (recently)'].values

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(In_Velo,Product_now))

(-0.08968344134564671, 0.0013475314989388053)


In [0]:
"""p-value between OUTDOOR Ambient Temp. & Relative Productivity (recently)"""
from scipy import stats

df_prod=df_prod.fillna(df_prod.mean())
Out_Temp=df_prod['OUTDOOR Ambient Temp.'].values
Product_now=df_prod['Relative Productivity (recently)'].values

# Reture [Pearson correlation coefficient, p-value for testing non-correlation]
print(stats.pearsonr(Out_Temp,Product_now))

(-0.08512216116525326, 0.0023500379018934533)


In [0]:
# Test for 'FeatureSelector' package
# !pip install lightgbm
from feature_selector import FeatureSelector
import pandas as pd
fs = FeatureSelector(data = X_df, labels = y_df)
fs.identify_collinear(correlation_threshold=0.1)
fs.plot_collinear()

## Regression


### Features: Environment+gender+age; Label: General thermal comfort(right now)

In [0]:
tempDict = {}
for dt in df.loc[:, "Time"]:
  temp = dt.split()

  if temp[0] in tempDict:
    tempDict[temp[0]] += 1
  else:
    tempDict[temp[0]] = 1

tempDict
# print(tempDict)

# for key, value in tempDict.items():
#   print("{}: {}".format(key, value))

# print(tempDict)

# a = [1,2,4]
# b = tuple(a)
# d = {}
# d[b] = 1
# print(d)

# a = []
# a = list()

# a = [1,2,4]
# b = a[0]
# d = {}
# d[b] = 1
# print(d)

In [0]:
#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
y_df.describe()
	
import seaborn as sns
from scipy.stats import norm
sns.distplot(y_df,fit=norm)

In [0]:
df['General Thermal Comfort'].describe()

In [0]:
# Predict General Thermal Comfort (right now) with Linear Regression
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
regressor = LinearRegression()
# regressor.fit(X_train, y_train)
regressor.fit(X_train_scaled, y_train)

# Predicting the Test set results
#y_pred = regressor.predict(X_test)
y_pred = regressor.predict(X_test_scaled)
print ("R2 is: %f" % r2_score(y_test, y_pred))
print ("MSE is: %f" % mean_squared_error(y_test, y_pred))
print ("MAE is: %f" % mean_absolute_error(y_test, y_pred))

R2 is: 0.018371
MSE is: 0.696815
MAE is: 0.696517


In [0]:
# Predict General Thermal Comfort (right now) with SVR
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
svr = GridSearchCV(SVR(kernel='rbf'), cv=5,
                   param_grid={"C": [0.1, 1, 2, 4],
                               "gamma": np.logspace(-2, 2, 5)})
#svr.fit(X_train, y_train) 
svr.fit(X_train_scaled, y_train) 
print (svr.best_params_)
print (svr.refit) # Check whether refit or not

# Predict and evaluate
y_pred = svr.predict(X_test_scaled)
print ("R2 is: %f" % r2_score(y_test, y_pred))
print ("MSE is: %f" % mean_squared_error(y_test, y_pred))
print ("MAE is: %f" % mean_absolute_error(y_test, y_pred))


{'C': 10.0, 'gamma': 0.1}
True
R2 is: 0.048152
MSE is: 0.675675
MAE is: 0.623798


In [0]:
# Predict General Thermal Comfort (right now) with RF regressor
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values
# X_df.describe()
# X_df.shape

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values
# print(y_df.describe())
# print(y_df.shape)

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


pipeline = make_pipeline( 
                         RandomForestRegressor(n_estimators=100))

# Tune parameter
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [15, 12, 8, 5, 3]}
 
# Tune param and fit model 
clf = GridSearchCV(pipeline, hyperparameters, cv=7)
clf.fit(X_train, y_train) 
# clf.fit(X_train_scaled, y_train) 
print (clf.best_params_)
print (clf.refit) # Check whether refit or not

# Predict and evaluate
y_pred = clf.predict(X_test)
print ("R2 is: %f" % r2_score(y_test, y_pred))
print ("MSE is: %f" % mean_squared_error(y_test, y_pred))
print ("MAE is: %f" % mean_absolute_error(y_test, y_pred))


{'randomforestregressor__max_depth': 12, 'randomforestregressor__max_features': 'auto'}
True
R2 is: 0.285396
MSE is: 0.507266
MAE is: 0.562763


In [0]:
# Predict General Thermal Comfort (right now) with NN Regressor
import numpy as np
np.random.seed(1337)  # for reproducibility
from keras.models import Sequential
from keras.layers import Dense # Full connected

# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create model
model = Sequential()
# model.add(Dense(output_dim=1, input_dim=11,activation='relu')) # 定义下一层时，会自动把上层的output作为input
#model.add(Dense(1)) 
model.add(Dense(11, input_dim=11, init='normal', activation='relu'))
model.add(Dense(1, init='normal'))

# Declare loss function and optimizer
model.compile(loss='mean_squared_error', optimizer='adam')

# Training
print('Training -----------')
for step in range(15001):
    cost = model.train_on_batch(X_train, y_train)
    if step % 1000 == 0:
        print('train cost: ', cost)
        
# test
print('\nTesting ------------')
cost = model.evaluate(X_test, y_test, batch_size=40)
print('test cost:', cost)
W, b = model.layers[0].get_weights()
print('Weights=', W, '\nbiases=', b)

# predict and evaluate
y_pred = model.predict(X_test)
print ("R2 is: %f" % r2_score(y_test, y_pred))
print ("MSE is: %f" % mean_squared_error(y_test, y_pred))
print ("MAE is: %f" % mean_absolute_error(y_test, y_pred))


Using TensorFlow backend.


Training -----------
train cost:  26.99479
train cost:  0.7631929
train cost:  0.7331186
train cost:  0.70960736
train cost:  0.69406295
train cost:  0.68729
train cost:  0.68479836
train cost:  0.6782065
train cost:  0.6444712
train cost:  0.62972385
train cost:  0.628761
train cost:  0.6202479
train cost:  0.61995447
train cost:  0.61773175
train cost:  0.62303656
train cost:  0.6153509

Testing ------------
test cost: 0.6137274646759033
Weights= [[-4.21481058e-02 -3.66475374e-01  3.13000679e-01 -1.00253150e-01
   2.12153280e-03  1.21023603e-01  4.38553691e-02 -1.74892962e-01
  -1.34120258e-02 -6.68890774e-02 -1.27892435e-01]
 [-1.11041024e-01 -1.25436619e-01  2.95314603e-02  6.23297356e-02
  -6.18607551e-02  2.15747599e-02  6.12894446e-03  3.86846736e-02
   1.60675291e-02 -4.25639749e-02 -6.88211471e-02]
 [ 1.59026440e-02 -3.31042218e+00  4.90839052e+00 -3.00621128e+00
  -9.78393137e-01 -8.60334635e-01  8.35618004e-02 -2.71093917e+00
  -3.54489908e-02 -8.96623135e-02 -7.28476718e-02

## Classification

### Features: Environment+gender+age; Label: General thermal comfort(right now)暂时不用

In [0]:
# Predict General Thermal Comfort (right now) with Lightgbm
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Fit data format
# train_data=lgb.Dataset(X_train,label=y_train)
# validation_data=lgb.Dataset(X_test,label=y_test)
train_data=lgb.Dataset(X_train_scaled,label=y_train)
validation_data=lgb.Dataset(X_test_scaled,label=y_test)
# Build parameter list and model
params = {'objective': 'multiclass',
                  'learning_rate': 0.02,
                  'lambda_l1': 0.1,
                  'lambda_l2': 0.2,
                  'max_depth': 7,
                  'num_leaves': 14,
                  'is_unbalance': 'true',
                  'num_class': 50
}
model=lgb.train(params, train_data, valid_sets=[validation_data], num_boost_round=250)

# Obtain results
# y_pred=model.predict(X_test)
y_pred=model.predict(X_test_scaled)
y_pred=[list(x).index(max(x)) for x in y_pred]
from sklearn.metrics import classification_report,accuracy_score
print('acc score: %f' % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

In [0]:
# Predict General Thermal Comfort (right now) with SVM
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV,cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1, 2,3, 4,6,8], 'gamma':[0.125, 0.25, 0.5 ,1, 2, 4]}
svc=svm.SVC()
clf = GridSearchCV(svc, parameters)

clf.fit(X_train_scaled, y_train) 
#clf.fit(X_train,y_train) 
print (clf.best_params_)
print (clf.refit) # Check whether refit or not

# Predict and evaluate
# y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test_scaled)
from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))


In [0]:
# Predict General Thermal Comfort (right now) with Logistic Regression
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
classifier = LogisticRegression()
# classifier.fit(X_train, y_train)
classifier.fit(X_train_scaled, y_train)

# Predicting the Test set results
# y_pred = classifier.predict(X_test)
y_pred = classifier.predict(X_test_scaled)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

acc score is: 0.518000
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         1
        2.0       0.00      0.00      0.00         7
        3.0       0.00      0.00      0.00        56
        4.0       0.34      0.19      0.24       151
        5.0       0.55      0.91      0.69       253
        6.0       0.67      0.06      0.11        32

avg / total       0.43      0.52      0.43       500



  'precision', 'predicted', average, warn_for)


In [0]:
# Predict General Thermal Comfort (right now) with Keras Clf
import numpy as np 
np.random.seed(1337)  # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop
from keras.utils import np_utils

# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort (right now)']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,36] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort (right now)']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from keras.utils import to_categorical
y = to_categorical(y)

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(X_df, y,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model
model = Sequential([
    Dense(1, input_dim=11), # output is 1-D, input is 11-D
    Activation('relu'),
    Dense(1), # output is 1-D
    Activation('softmax'),
])

# Another way to define your optimizer
rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

# Compile model and declare loss function and optimizer
model.compile(optimizer=rmsprop,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fit model
print('Training ------------')
# Another way to train the model
model.fit(X_train, y_train, nb_epoch=10, batch_size=32) # Train 10 times

print (model.refit) # Check whether refit or not

# Predict and evaluate
loss, accuracy = model.evaluate(X_test, y_test)

print('test loss: ', loss)
print('test accuracy: ', accuracy)
from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))


### Features: Environment Label: General thermal comfort 暂时不用

In [0]:
# Predict General Thermal Comfort (right now) with Lightgbm
# !pip install LightGBM
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,53] # select 'General Thermal Comfort' as label
y_df=y_df[df['General Thermal Comfort']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Fit data format
# train_data=lgb.Dataset(X_train,label=y_train)
# validation_data=lgb.Dataset(X_test,label=y_test)
train_data=lgb.Dataset(X_train_scaled,label=y_train)
validation_data=lgb.Dataset(X_test_scaled,label=y_test)
# Build parameter list and model
params = {'objective': 'multiclass',
                  'learning_rate': 0.02,
                  'lambda_l1': 0.1,
                  'lambda_l2': 0.2,
                  'max_depth': 7,
                  'num_leaves': 14,
                  'is_unbalance': 'true',
                  'num_class': 50
}
model=lgb.train(params, train_data, valid_sets=[validation_data], num_boost_round=250)

# Obtain results
# y_pred=model.predict(X_test)
y_pred=model.predict(X_test_scaled)
y_pred=[list(x).index(max(x)) for x in y_pred]
from sklearn.metrics import classification_report,accuracy_score
print('acc score: %f' % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

In [0]:
# Predict General Thermal Comfort (right now) with SVM
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,53] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV,cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1,2,3, 4,6,8], 'gamma':[0.125, 0.25, 0.5 ,1, 2, 4]}
svc=svm.SVC()
clf = GridSearchCV(svc, parameters)

clf.fit(X_train_scaled, y_train)
#clf.fit(X_train,y_train) 
print (clf.best_params_)
print (clf.refit) # Check whether refit or not

# Predict and evaluate
# y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test_scaled)
from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))


In [0]:
# Predict General Thermal Comfort (right now) with Logistic Regression
# features
X1_df=df.iloc[:,5:14] # select 'Environment' category
X2_df=df.iloc[:,25:27] # select gender and age
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df[df['General Thermal Comfort']>0]
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df.iloc[:,53] # select 'General Thermal Comfort (right now)' as label
y_df=y_df[df['General Thermal Comfort']>0]
# y_df=y_df.fillna(y_df.mean())
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
classifier = LogisticRegression()
# classifier.fit(X_train, y_train)
classifier.fit(X_train_scaled, y_train)

# Predicting the Test set results
# y_pred = classifier.predict(X_test)
y_pred = classifier.predict(X_test_scaled)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

### Features: INDOOR Ambient Temp.+INDOOR Relative Humidity+INDOOR Air Velocity+INDOOR Mean Radiant Temp.; Label: General thermal comfort(right now)

In [0]:
#label
y_df=df.iloc[:,9].copy()
y_df.value_counts() # Results show that this label is imbalanced

In [0]:
"""Imbalance class resample"""
from sklearn.utils import resample
y_df=df.iloc[:,9].copy()

# Separate majority and minority classes
df_major = df[y_df==5.0]
df_minor1 = df[y_df==4.0]
df_minor2 = df[y_df==3.0]
df_minor3 = df[y_df==6.0]
df_minor4 = df[y_df==2.0]
df_minor5 = df[y_df==1.0]

# Downsample majority class
df_major_downsampled = resample(df_major, 
                                 replace=False,    # sample without replacement
                                 n_samples=1000,   # to match minority class
                                 random_state=123) # reproducible results

# Upsample minority class
df_minor1_upsampled = resample(df_minor1, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor2_upsampled = resample(df_minor2, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor3_upsampled = resample(df_minor3, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor4_upsampled = resample(df_minor4, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor5_upsampled = resample(df_minor5, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_resample = pd.concat([df_major_downsampled, df_minor1_upsampled,df_minor2_upsampled,
                         df_minor3_upsampled,df_minor4_upsampled,df_minor5_upsampled])

# Display new class counts
df_resample['General Thermal Comfort (right now)'].value_counts()

1.0    1000
2.0    1000
6.0    1000
3.0    1000
4.0    1000
5.0    1000
Name: General Thermal Comfort (right now), dtype: int64

In [0]:
"""Predict General Thermal Comfort (right now) with Logistic Regression"""
# features
X_df=df_resample.iloc[:,:4] # select first four features
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df_resample.iloc[:,9].copy() # select 'General Thermal Comfort (right now)' as label
y=y_df.values

"""One hot
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
y_df=y_df.values.reshape(-1,1)
enc.fit(y_df)
enc.transform(y_df).toarray()"""


from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
classifier = LogisticRegression()
# classifier.fit(X_train, y_train)
classifier.fit(X_train_scaled, y_train)

# Predicting the Test set results
# y_pred = classifier.predict(X_test)
y_pred = classifier.predict(X_test_scaled)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

acc score is: 0.165000
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       200
        2.0       0.20      0.36      0.26       200
        3.0       0.23      0.07      0.10       200
        4.0       0.04      0.01      0.01       200
        5.0       0.26      0.18      0.21       200
        6.0       0.15      0.38      0.21       200

avg / total       0.15      0.17      0.13      1200



  y = column_or_1d(y, warn=True)


In [0]:
# Predict General Thermal Comfort (right now) with SVM
# features
# features
X_df=df_resample.iloc[:,:4] # select first four features
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df_resample.iloc[:,9].copy() # select 'General Thermal Comfort (right now)' as label
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV,cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1, 2,3, 4,6,8], 'gamma':[0.125, 0.25, 0.5 ,1, 2, 4]}
svc=svm.SVC()
clf = GridSearchCV(svc, parameters)

clf.fit(X_train_scaled, y_train) 
#clf.fit(X_train,y_train) 
print (clf.best_params_)
print (clf.refit) # Check whether refit or not

# Predict and evaluate
# y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test_scaled)
from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))


In [0]:
"""Predict General Thermal Comfort (right now) with Lightgbm"""

# features
X_df=df_resample.iloc[:,:4] # select first four features
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df_resample.iloc[:,9].copy() # select 'General Thermal Comfort (right now)' as label
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Fit data format
# train_data=lgb.Dataset(X_train,label=y_train)
# validation_data=lgb.Dataset(X_test,label=y_test)
train_data=lgb.Dataset(X_train_scaled,label=y_train)
validation_data=lgb.Dataset(X_test_scaled,label=y_test)
# Build parameter list and model
params = {'objective': 'multiclass',
                  'learning_rate': 0.02,
                  'lambda_l1': 0.1,
                  'lambda_l2': 0.2,
                  'max_depth': 7,
                  'num_leaves': 14,
                  'is_unbalance': 'true',
                  'num_class': 50
}
model=lgb.train(params, train_data, valid_sets=[validation_data], num_boost_round=250)

# Obtain results
# y_pred=model.predict(X_test)
y_pred=model.predict(X_test_scaled)
y_pred=[list(x).index(max(x)) for x in y_pred]
from sklearn.metrics import classification_report,accuracy_score
print('acc score: %f' % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

[1]	valid_0's multi_logloss: 1.77798
[2]	valid_0's multi_logloss: 1.7647
[3]	valid_0's multi_logloss: 1.75201
[4]	valid_0's multi_logloss: 1.73964
[5]	valid_0's multi_logloss: 1.72821
[6]	valid_0's multi_logloss: 1.71687
[7]	valid_0's multi_logloss: 1.70606
[8]	valid_0's multi_logloss: 1.69564
[9]	valid_0's multi_logloss: 1.68605
[10]	valid_0's multi_logloss: 1.67655
[11]	valid_0's multi_logloss: 1.66666
[12]	valid_0's multi_logloss: 1.65756
[13]	valid_0's multi_logloss: 1.64858
[14]	valid_0's multi_logloss: 1.63957
[15]	valid_0's multi_logloss: 1.63133
[16]	valid_0's multi_logloss: 1.62325
[17]	valid_0's multi_logloss: 1.61457
[18]	valid_0's multi_logloss: 1.60677
[19]	valid_0's multi_logloss: 1.59953
[20]	valid_0's multi_logloss: 1.59204
[21]	valid_0's multi_logloss: 1.58375
[22]	valid_0's multi_logloss: 1.57615
[23]	valid_0's multi_logloss: 1.56936
[24]	valid_0's multi_logloss: 1.562
[25]	valid_0's multi_logloss: 1.55438
[26]	valid_0's multi_logloss: 1.54793
[27]	valid_0's multi_log

In [0]:
# Predict General Thermal Sensation (right now) with RF Classifier
from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
np.random.seed(0) # Set random seed

# features
X_df=df_resample.iloc[:,:4] # select first four features
X_df=X_df.fillna(X_df.mean())
X=X_df.values
# print(X_df.head())

#label
y_df=df_resample.iloc[:,9].copy() # select 'General Thermal Comfort (right now)' as label
y=y_df.values
# print(y_df.head())
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

#Create a Gaussian Classifier
rfc=RandomForestClassifier(n_estimators=80)
rfc.fit(X_train,y_train)

# Predicting the Test set results
y_pred=rfc.predict(X_test)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

### Features: INDOOR Ambient Temp. + INDOOR Mean Radiant Temp.; Label: thermal sensation(right now)

In [0]:
#label
y_df=df.iloc[:,10].copy()
y_df.value_counts() # Results show that this label is imbalanced

# import seaborn as sns
# from scipy.stats import norm
# sns.distplot(y_df,fit=norm) # Imbalance class

In [0]:
"""Imbalance class resample"""
from sklearn.utils import resample
y_df=df.iloc[:,10].copy()

# Separate majority and minority classes
df_major = df[y_df==0.0]
df_minor1 = df[y_df==1]
df_minor2 = df[y_df==2]
df_minor3 = df[y_df==2.5]
df_minor4 = df[y_df==3]
df_minor5 = df[y_df==-1]
df_minor6 = df[y_df==-2]
df_minor7 = df[y_df==-2.5]
df_minor8 = df[y_df==-3]
# y_df_minor8.value_counts()

# Downsample majority class
df_major_downsampled = resample(df_major, 
                                 replace=False,    # sample without replacement
                                 n_samples=1000,   # to match minority class
                                 random_state=123) # reproducible results

# Upsample minority class
df_minor1_upsampled = resample(df_minor1, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor2_upsampled = resample(df_minor2, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor3_upsampled = resample(df_minor3, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor4_upsampled = resample(df_minor4, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor5_upsampled = resample(df_minor5, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor6_upsampled = resample(df_minor6, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor7_upsampled = resample(df_minor7, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor8_upsampled = resample(df_minor8, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_resample = pd.concat([df_major_downsampled, df_minor1_upsampled,df_minor2_upsampled,
                         df_minor3_upsampled,df_minor4_upsampled,df_minor5_upsampled,
                         df_minor6_upsampled,df_minor7_upsampled,df_minor8_upsampled])

# Display new class counts
df_resample['Thermal Sensation (right now)'].value_counts()

In [0]:
"""Imbalance class resample without 2.5 & 3"""
from sklearn.utils import resample
y_df=df.iloc[:,10].copy()

# Separate majority and minority classes
df_major = df[y_df==0.0]
df_minor1 = df[y_df==1]
df_minor2 = df[y_df==2]
# df_minor3 = df[y_df==2.5]
# df_minor4 = df[y_df==3]
df_minor5 = df[y_df==-1]
df_minor6 = df[y_df==-2]
df_minor7 = df[y_df==-2.5]
df_minor8 = df[y_df==-3]

# Downsample majority class
df_major_downsampled = resample(df_major, 
                                 replace=False,    # sample without replacement
                                 n_samples=1000,   # to match minority class
                                 random_state=123) # reproducible results

# Upsample minority class
df_minor1_upsampled = resample(df_minor1, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor2_upsampled = resample(df_minor2, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results
"""
df_minor3_upsampled = resample(df_minor3, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor4_upsampled = resample(df_minor4, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results
"""

df_minor5_upsampled = resample(df_minor5, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor6_upsampled = resample(df_minor6, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor7_upsampled = resample(df_minor7, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results

df_minor8_upsampled = resample(df_minor8, 
                                 replace=True,     # sample with replacement
                                 n_samples=1000,   # to match majority class
                                 random_state=123) # reproducible results


# df_resample = pd.concat([df_major_downsampled, df_minor1_upsampled,df_minor2_upsampled,
#                          df_minor3_upsampled,df_minor4_upsampled,df_minor5_upsampled,
#                          df_minor6_upsampled,df_minor7_upsampled,df_minor8_upsampled])

df_resample = pd.concat([df_major_downsampled, df_minor1_upsampled,df_minor2_upsampled,
                         df_minor5_upsampled,
                         df_minor6_upsampled,df_minor7_upsampled,df_minor8_upsampled])

# Display new class counts
df_resample['Thermal Sensation (right now)'].value_counts()

In [0]:
# Predict General Thermal Sensation (right now) with Logistic Regression
# features
X1_df=df_resample.iloc[:,:2] # select 'INDOOR Ambient Temp.' + 'INDOOR Relative Humidity'
X2_df=df_resample.iloc[:,3] # select 'INDOOR Mean Radiant Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df_resample.iloc[:,10].copy() # select 'General Thermal Sensation (right now)' as label
y_df*=2 # tackle with -2.5 and 2.5
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
classifier = LogisticRegression()
# classifier.fit(X_train, y_train)
classifier.fit(X_train_scaled, y_train)

# Predicting the Test set results
# y_pred = classifier.predict(X_test)
y_pred = classifier.predict(X_test_scaled)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

In [0]:
# Predict Thermal Sensation (right now) with SVM
# features
X1_df=df_resample.iloc[:,2] # select 'INDOOR Ambient Temp.' + 'INDOOR Relative Humidity'
X2_df=df_resample.iloc[:,3] # select 'INDOOR Mean Radiant Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df_resample.iloc[:,10].copy() # select 'General Thermal Sensation (right now)' as label
y_df*=2 # tackle with -2.5 and 2.5
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV,cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1,2,4,6,8], 'gamma':[0.25, 0.5 ,1, 2, 4]}
svc=svm.SVC()
clf = GridSearchCV(svc, parameters)

clf.fit(X_train_scaled, y_train)
#clf.fit(X_train,y_train) 
print (clf.best_params_)
print (clf.refit) # Check whether refit or not

# Predict and evaluate
# y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test_scaled)
from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))


In [0]:
# Predict Thermal Comfort (right now) with Lightgbm
# !pip install lightGBM
# features
X1_df=df_resample.iloc[:,:2] # select 'INDOOR Ambient Temp.' + 'In Humidity'
X2_df=df_resample.iloc[:,3] # select 'INDOOR Mean Radiant Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df_resample.iloc[:,10].copy() # select 'General Thermal Comfort (right now)' as label
y_df+=3 # scale up 3 unit
y_df*=2 # tackle with -2.5 and 2.5
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Fit data format
# train_data=lgb.Dataset(X_train,label=y_train)
# validation_data=lgb.Dataset(X_test,label=y_test)
train_data=lgb.Dataset(X_train_scaled,label=y_train)
validation_data=lgb.Dataset(X_test_scaled,label=y_test)
# Build parameter list and model
params = {'objective': 'multiclass',
                  'learning_rate': 0.02,
                  'lambda_l1': 0.1,
                  'lambda_l2': 0.2,
                  'max_depth': 7,
                  'num_leaves': 14,
                  'is_unbalance': 'true',
                  'num_class': 50
}
model=lgb.train(params, train_data, valid_sets=[validation_data], num_boost_round=250)

# Obtain results
# y_pred=model.predict(X_test)
y_pred=model.predict(X_test_scaled)
y_pred=[list(x).index(max(x)) for x in y_pred]

from sklearn.metrics import classification_report,accuracy_score
print('acc score: %f' % accuracy_score(y_test,y_pred))
# print('acc score: %f' % accuracy_score(y_test,y_pred.round()))
print (classification_report(y_test,y_pred))
# print (classification_report(y_test,y_pred.round()))

In [0]:
# Predict General Thermal Sensation (right now) with RF Classifier
from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
np.random.seed(0) # Set random seed

# features
X1_df=df_resample.iloc[:,:2] # select 'INDOOR Ambient Temp.' + 'INDOOR Relative Humidity'
X2_df=df_resample.iloc[:,3] # select 'INDOOR Mean Radiant Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values

#label
y_df=df_resample.iloc[:,10].copy() # select 'General Thermal Sensation (right now)' as label
y_df*=2 # tackle with -2.5 and 2.5
y=y_df.values

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

#Create a Gaussian Classifier
rfc=RandomForestClassifier(n_estimators=80)
rfc.fit(X_train,y_train)

# Predicting the Test set results
y_pred=rfc.predict(X_test)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

### Features: INDOOR Relative Humidity + INDOOR Air Velocity + OUTDOOR Ambient Temp.; Label: Productivity


In [0]:
#label
y_df=df_prod.iloc[:,11].copy()
y_df.value_counts() # Results show that this label is imbalanced

# import seaborn as sns
# from scipy.stats import norm
# sns.distplot(y_df,fit=norm) # Imbalance class

In [0]:
"""Imbalance class resample"""
from sklearn.utils import resample
y_df=df_prod.iloc[:,11].copy()

# Separate majority and minority classes
df_major = df_prod[y_df==3.0]
df_minor1 = df_prod[y_df==4.0]
df_minor2 = df_prod[y_df==2.0]
df_minor3 = df_prod[y_df==5.0]
df_minor4 = df_prod[y_df==1.0]

# Downsample majority class
df_major_downsampled = resample(df_major, 
                                 replace=False,    # sample without replacement
                                 n_samples=800,   # to match minority class
                                 random_state=123) # reproducible results

# Upsample minority class
df_minor1_upsampled = resample(df_minor1, 
                                 replace=True,     # sample with replacement
                                 n_samples=800,   # to match majority class
                                 random_state=123) # reproducible results

df_minor2_upsampled = resample(df_minor2, 
                                 replace=True,     # sample with replacement
                                 n_samples=800,   # to match majority class
                                 random_state=123) # reproducible results

df_minor3_upsampled = resample(df_minor3, 
                                 replace=True,     # sample with replacement
                                 n_samples=800,   # to match majority class
                                 random_state=123) # reproducible results

df_minor4_upsampled = resample(df_minor4, 
                                 replace=True,     # sample with replacement
                                 n_samples=800,   # to match majority class
                                 random_state=123) # reproducible results

df_resample = pd.concat([df_major_downsampled, df_minor1_upsampled,df_minor2_upsampled,
                         df_minor3_upsampled,df_minor4_upsampled])

# Display new class counts
df_resample['Relative Productivity (recently)'].value_counts()

In [0]:
# Predict Relative Productivity (recently) with Logistic Regression
# features
X1_df=df_resample.iloc[:,1:3] # select 'INDOOR Relative Humidity' + 'INDOOR Air Velocity'
X2_df=df_resample['OUTDOOR Ambient Temp.'] # select 'OUTDOOR Ambient Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values
# print(X_df)

#label
y_df=df_resample.iloc[:,11].copy() # select 'Relative Productivity (recently)' as label
# print(y_df)
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
lrc = LogisticRegression()
# classifier.fit(X_train, y_train)
lrc.fit(X_train_scaled, y_train)

# Predicting the Test set results
# y_pred = classifier.predict(X_test)
y_pred = lrc.predict(X_test_scaled)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))

In [0]:
# Predict Relative Productivity (recently) with SVM
# features
X1_df=df_resample.iloc[:,1:3] # select 'INDOOR Relative Humidity' + 'INDOOR Air Velocity'
X2_df=df_resample['OUTDOOR Ambient Temp.'] # select 'OUTDOOR Ambient Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values
# print(X_df)

#label
y_df=df_resample.iloc[:,11].copy() # select 'Relative Productivity (recently)' as label
# print(y_df)
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV,cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tune param and fit model 
parameters = {'kernel':('sigmoid', 'rbf'), 'C':[1,2,4,6,8], 'gamma':[0.25, 0.5 ,1, 2, 4]}
svc=svm.SVC()
clf = GridSearchCV(svc, parameters)

clf.fit(X_train_scaled, y_train)
#clf.fit(X_train,y_train) 
print (clf.best_params_)
print (clf.refit) # Check whether refit or not

# Predict and evaluate
# y_pred = clf.predict(X_test)
y_pred = clf.predict(X_test_scaled)
from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))


In [0]:
# Predict Relative Productivity (recently) with Lightgbm
# !pip install lightGBM
# features
X1_df=df_resample.iloc[:,1:3] # select 'INDOOR Relative Humidity' + 'INDOOR Air Velocity'
X2_df=df_resample['OUTDOOR Ambient Temp.'] # select 'OUTDOOR Ambient Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values
# print(X_df)

#label
y_df=df_resample.iloc[:,11].copy() # select 'Relative Productivity (recently)' as label
# print(y_df)
y=y_df.values

from sklearn.model_selection import train_test_split	
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

# Standardization
X_train_scaled = preprocessing.scale(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply transformer to train/test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Fit data format
# train_data=lgb.Dataset(X_train,label=y_train)
# validation_data=lgb.Dataset(X_test,label=y_test)
train_data=lgb.Dataset(X_train_scaled,label=y_train)
validation_data=lgb.Dataset(X_test_scaled,label=y_test)
# Build parameter list and model
params = {'objective': 'multiclass',
                  'learning_rate': 0.02,
                  'lambda_l1': 0.1,
                  'lambda_l2': 0.2,
                  'max_depth': 7,
                  'num_leaves': 14,
                  'is_unbalance': 'true',
                  'num_class': 50
}
model=lgb.train(params, train_data, valid_sets=[validation_data], num_boost_round=250)

# Obtain results
# y_pred=model.predict(X_test)
y_pred=model.predict(X_test_scaled)
y_pred=[list(x).index(max(x)) for x in y_pred]

from sklearn.metrics import classification_report,accuracy_score
print('acc score: %f' % accuracy_score(y_test,y_pred))
# print('acc score: %f' % accuracy_score(y_test,y_pred.round()))
print (classification_report(y_test,y_pred))
# print (classification_report(y_test,y_pred.round()))

In [0]:
# Predict Relative Productivity (recently) with RF Classifier
# features
X1_df=df_resample.iloc[:,1:3] # select 'INDOOR Relative Humidity' + 'INDOOR Air Velocity'
X2_df=df_resample['OUTDOOR Ambient Temp.'] # select 'OUTDOOR Ambient Temp.'
X_df=pd.concat([X1_df,X2_df],axis=1)
X_df=X_df.fillna(X_df.mean())
X=X_df.values
# print(X_df)

#label
y_df=df_resample.iloc[:,11].copy() # select 'Relative Productivity (recently)' as label
# print(y_df)
y=y_df.values

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,test_size=0.2,random_state=123,stratify=y)

#Create a Gaussian Classifier
rfc=RandomForestClassifier(n_estimators=80)
rfc.fit(X_train,y_train)

# Predicting the Test set results
y_pred=rfc.predict(X_test)

from sklearn.metrics import classification_report,accuracy_score
print("acc score is: %f" % accuracy_score(y_test,y_pred))
print (classification_report(y_test,y_pred))