In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Google Sheets link as copied from the browser's edit view; the gid selects the tab.
url = (
    'https://docs.google.com/spreadsheets/d/'
    '17gwptKOr03IIYs41n5hPrG81qm1i62eUyTdcX4FgYGI'
    '/edit?gid=1779823967#gid=1779823967'
)

# Turn the editor path into the CSV export path for the same gid so that
# pandas can read the tab directly.
csv_url = url.replace('/edit?gid=', '/export?format=csv&gid=')

df = pd.read_csv(csv_url)
df

In [None]:
# Remove the 'Timestamp' column.
# `df` is rebound instead of mutated in place, and errors='ignore' makes a
# second execution of this cell a no-op instead of a KeyError.
df = df.drop(columns = ['Timestamp'], errors = 'ignore')
df

In [None]:
# Respondents without a faculty get the placeholder 'High School'.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on `df['Faculty']` mutates an intermediate Series, which is deprecated and
# leaves `df` unchanged when pandas Copy-on-Write is active.
df['Faculty'] = df['Faculty'].fillna('High School')
df

In [5]:
def fill_level_of_education(row: pd.Series) -> str:
    """Return the education-level label for one survey response.

    Args:
        row: One row of the survey frame; must provide 'Age' and 'Faculty'.

    Returns:
        'High school' when the respondent is younger than 18 or has no
        faculty, otherwise 'Bachelor Degree'.
    """
    faculty = row['Faculty']

    # "No faculty" is either a missing value or the 'High School' placeholder
    # that the notebook writes into missing 'Faculty' cells before this
    # function is applied; checking only pd.isna() would miss the latter.
    has_no_faculty = pd.isna(faculty) or faculty == 'High School'

    if row['Age'] < 18 or has_no_faculty:
        return 'High school'

    return 'Bachelor Degree'

In [None]:
# Derive the education level of every respondent from their age and faculty.
df['What level of education are you currently studying?'] = df.apply(fill_level_of_education, axis=1)

# Missing study years become 0. The result is assigned back because
# fillna(inplace=True) on a selected column is deprecated chained mutation
# and does not update `df` when pandas Copy-on-Write is active.
df['Year'] = df['Year'].fillna(0)

df

In [7]:
# Ordinal encodings for categorical survey answers: every answer label is
# mapped to its 1-based rank, in the order the labels are listed.

# Monthly expense brackets.
expenses_range = {
    label: rank
    for rank, label in enumerate(
        ("<10,000", ">10,000 but <15,000", ">15,000 but <20,000", ">20,000"),
        start=1,
    )
}

# Exercise frequency.
excercise_value = {
    label: rank
    for rank, label in enumerate(("Never", "Rarely", "Weekly", "Daily"), start=1)
}

# Generic five-step frequency scale.
frequency_to_number = {
    label: rank
    for rank, label in enumerate(
        ("Never", "Rarely", "Sometimes", "Often", "Always"),
        start=1,
    )
}

In [None]:
# Show the frame's column labels. Other cells index columns by their exact
# text, and some labels end with a trailing space.
df.columns

In [None]:
# Replace the text answers of the ordinal questions with their numeric rank.
# The lookup is strict: an answer that is absent from its mapping raises KeyError.
ordinal_mappings = {
    'What is your average monthly expense as a student?': expenses_range,
    'How often do you exercise?': excercise_value,
    'Do you feel stressed due to academic pressure? ': frequency_to_number,
}

for question, ranks in ordinal_mappings.items():
    df[question] = [ranks[answer] for answer in df[question]]

df

In [None]:
# Histogram of the 'Age' column.
sns.histplot(df['Age'])

In [None]:
# Histogram of the 'Faculty' column (one bar per distinct value).
sns.histplot(df['Faculty'])

In [None]:
# Monthly expense against academic stress.
# The two columns are passed as x/y explicitly: handing a two-column frame to
# `data` alone makes seaborn treat it as wide-form data and draw every column
# against the row index instead of against each other.
sns.scatterplot(
    data = df,
    x = 'What is your average monthly expense as a student?',
    y = 'Do you feel stressed due to academic pressure? ',
    legend = False
)

# The title names the columns that are actually plotted.
plt.title('Monthly expense VS Academic stress')
plt.show()

In [None]:
# Average hours of sleep against academic stress.
# The two columns are passed as x/y explicitly: handing a two-column frame to
# `data` alone makes seaborn treat it as wide-form data and draw every column
# against the row index instead of against each other.
sns.scatterplot(
    data = df,
    x = 'How many hours do you sleep on average?',
    y = 'Do you feel stressed due to academic pressure? ',
    legend = False
)

plt.title('Hour of sleep VS Academic stress')
plt.show()

In [None]:
# Display the installed seaborn version.
sns.__version__

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# NaN correlations are drawn as 1 because of fillna(1).
# NOTE(review): presumably these NaNs come from constant columns; confirm
# that 1 is the intended display value for them.
sns.heatmap(df.corr(numeric_only = True).fillna(1))

In [None]:
# Fit a straight line (degree-1 polynomial) of academic stress over sleep hours.
x = df['How many hours do you sleep on average?']
y = df['Do you feel stressed due to academic pressure? ']

# polyfit returns the coefficients, highest power first; poly1d wraps them
# into a callable polynomial.
coeff = np.polyfit(x, y, 1)
linear_regression_func = np.poly1d(coeff)

plt.plot(x, y, 'yo')                           # observed answers
plt.plot(x, linear_regression_func(x), '--k')  # fitted line
plt.title('Hour of sleep VS Academic stress (Prediction)')

plt.show()

In [None]:
# Predict the academic stress level for a given number of sleep hours by
# evaluating the fitted polynomial `linear_regression_func` at that value.
hours_of_sleep = 5
print(f'Academic stress level when sleep {hours_of_sleep} hours: {linear_regression_func(hours_of_sleep)}')

In [None]:
# Show the columns that still have dtype 'object'.
df.select_dtypes('object')

In [None]:
# One fitted LabelEncoder per object-dtype column, keyed by column name, so
# that encoded integers can be mapped back to their labels afterwards.
# NOTE: this cell overwrites the text columns of `df`; running it a second
# time finds no object columns and leaves `transformer` empty.
transformer = {}

for col in df.select_dtypes('object').columns:
    transformer[col] = preprocessing.LabelEncoder()
    df[col] = transformer[col].fit_transform(df[col])

# A fixed seed puts the same rows into train and test on every run, so the
# reported metrics are reproducible.
(train_df, test_df) = train_test_split(df, test_size = 0.2, random_state = 42)
train_df

In [None]:
# Columns used as model inputs; fill in the question texts to train on.
features = [
    # Select your topic
]

# Column the model learns to predict.
target_topic = 'What is your preferred learning method?'

# Fail early with a readable message instead of an error from inside scikit-learn.
if not features:
    raise ValueError('No features selected. Add at least one column name to `features`.')

# The label must not leak into the inputs.
if target_topic in features:
    raise KeyError('feature containing label data. Try using another question.')

x = train_df[features]
y = train_df[target_topic]

# `dtree` holds a random forest; the fixed seed makes training reproducible.
dtree = RandomForestClassifier(random_state = 42)
dtree = dtree.fit(x, y)
dtree

In [None]:
# Evaluate the model on the held-out rows and build a readable result table.
sample = test_df
x_sample = sample[features].copy()

# Decode label-encoded feature columns back to their text labels for display.
# Columns without an encoder are left as they are. The whole column is
# replaced (instead of `.loc[:, col] = ...`) because the decoded strings have
# a different dtype than the encoded integers they overwrite.
for col in x_sample.columns:
    if col in transformer:
        x_sample[col] = transformer[col].inverse_transform(x_sample[col])

target_encoder = transformer[target_topic]

# Class probabilities in percent, one row per sample.
class_probabilities = dtree.predict_proba(sample[features]) * 100

x_sample[f'{target_topic} (Prediction)'] = target_encoder.inverse_transform(dtree.predict(sample[features]))
x_sample[f'{target_topic} (Actual)'] = target_encoder.inverse_transform(sample[target_topic])

# Confidence is the probability of the most likely class.
x_sample['Confidence (%)'] = [round(float(max(e)), 2) for e in class_probabilities]

print(f'Prediction accuracy: {dtree.score(sample[features], sample[target_topic])*100:.2f}%')
print(f'Average confidence: {x_sample["Confidence (%)"].mean():.2f}%')

x_sample