# Import Necessary Libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'numpy'

# Import our data

In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od
# Fetch the Kaggle "mobile-health" dataset.
# NOTE(review): Kaggle downloads normally require API credentials - confirm how
# they are supplied in the environment this notebook runs in.
od.download("https://www.kaggle.com/datasets/gaurav2022/mobile-health")

In [None]:
# Read the raw sensor recordings. The path is relative to the working directory
# (od.download() saves into ./mobile-health by default), so the notebook is not
# tied to Colab's /content directory.
df_health = pd.read_csv("mobile-health/mhealth_raw_data.csv")

# Data Analysis

In [None]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df_health.head()

In [None]:
# Summarise column dtypes, non-null counts and memory usage.
df_health.info()

In [None]:
# (rows, columns) of the raw data.
df_health.shape

In [None]:
# Column labels of the raw data.
df_health.columns

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) per numeric column.
df_health.describe()

In [None]:
# Class balance of the raw data, before any down-sampling.
fig, ax = plt.subplots(figsize=(10,8))
ax.set_title("Data before sampling")

# One bar per activity label; the height is the number of rows with that label.
df_health['Activity'].value_counts().plot.bar(rot=1, ax=ax)

In [None]:
# Down-sample activity 0 to 40000 rows; every other activity is kept in full.
is_activity_0 = df_health['Activity'] == 0

# Fixed random_state so the same 40000 rows are drawn on every run.
data_activity_0 = df_health.loc[is_activity_0].sample(n=40000, random_state=42)
data_activity_else = df_health.loc[df_health['Activity'] != 0]

# Sampled activity-0 rows first, followed by all remaining rows.
df = pd.concat([data_activity_0, data_activity_else])

In [None]:
# Class balance again, this time on the down-sampled frame.
fig, ax = plt.subplots(figsize=(10,8))
ax.set_title("Data after sampling")

# One bar per activity label; the height is the number of rows with that label.
df['Activity'].value_counts().plot.bar(rot=1, ax=ax)

## EDA

In [None]:
# Share of each activity as a percentage of all rows, rounded to two decimals.
activity_share = (df['Activity'].value_counts() / df.shape[0] * 100).round(2)

fig, ax = plt.subplots(figsize=(12,8))
activity_share.plot.pie(autopct='%2.1f%%', ax=ax)

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# DataFrame.corr() no longer drops non-numeric columns silently (pandas >= 2.0
# raises instead), and at this point `df` still contains the 'subject' column,
# so restrict the frame to numeric/bool columns explicitly. For an all-numeric
# frame the result is unchanged.
fig, ax = plt.subplots(figsize =(18,18))
numeric_corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, annot=True, ax=ax)

In [None]:
# (rows, columns) of the down-sampled frame, for comparison with the trimmed frame later.
df.shape

In [None]:
### Outlier trimming by percentile
# (Originally labelled "confidence interval"; what happens here is a 1st-99th
# percentile filter applied feature by feature.)

df1 = df.copy()

# The slice skips the last two columns.
# NOTE(review): presumably those are 'Activity' and 'subject' - confirm the column order.
for feature in df1.columns[:-2]:
  # Bounds are always taken from the untrimmed `df`, so they do not shift as
  # rows are removed from `df1`.
  lower_range = np.quantile(df[feature], 0.01)
  upper_range = np.quantile(df[feature], 0.99)
  print(feature, 'range:', lower_range, 'to', upper_range)

  # Drop every row whose value for this feature lies outside the bounds.
  too_high = df1[feature] > upper_range
  too_low = df1[feature] < lower_range
  df1 = df1.drop(index=df1.loc[too_high | too_low].index)
  print('shape', df1.shape)

In [None]:
# (rows, columns) left after the percentile trimming.
df1.shape

In [None]:
# Continue with the trimmed data; copying keeps df1 unaffected by later changes to df.
df = df1.copy()

In [None]:
# Remove the 'subject' column so it is not used as a model feature.
# errors='ignore' keeps this cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns='subject', errors='ignore')

In [None]:
# Confirm which columns remain after dropping 'subject'.
df.columns

In [None]:
# Train test split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score,confusion_matrix, mean_absolute_error , r2_score , mean_squared_error, mean_absolute_percentage_error
# Target: the activity label. Features: every remaining column.
y = df['Activity']
x = df.drop(columns=['Activity'])

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# (rows, features) of the training split.
x_train.shape

In [None]:
# Display the held-out feature frame (still unscaled at this point).
x_test

In [None]:
# Display the training labels.
y_train

In [None]:
# Number of held-out labels; should match the row count of x_test.
y_test.shape

# Model Building and Model Evaluation

In [None]:
# Feature Scaling
from sklearn.preprocessing import RobustScaler
# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into preprocessing.
scaler = RobustScaler()
scaler.fit(x_train)

# Apply the same fitted transform to both splits (each name is rebound to the
# scaled version of itself).
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Logistic regression baseline: fit on the scaled training split and report the
# micro-averaged F1 on the held-out split.
lr = LogisticRegression()
lr.fit(x_train, y_train)
pred = lr.predict(x_test)

# pos_label is only meaningful with average='binary'; with average='micro'
# scikit-learn ignores it and emits a UserWarning, so it is not passed.
f1_score(y_test, pred, average='micro')

In [None]:
# Candidate classifiers keyed by the label shown in the score table.
# Every estimator is created with its default hyperparameters.
models = dict(
    Randomforest=RandomForestClassifier(),
    LogisticRegression=LogisticRegression(),
    SVC=SVC(),
    KNeighborsClassifier=KNeighborsClassifier(),
    GaussianNB=GaussianNB(),
    DecisionTreeClassifier=DecisionTreeClassifier(),
)

def fit_and_score(models, x_train, x_test, y_train, y_test):
  """Fit every model on the training split and score it on the test split.

  Args:
    models: dict mapping a display name to an estimator exposing
      ``fit(X, y)`` and ``predict(X)``. The estimators are fitted in place.
    x_train: training features.
    x_test: held-out features.
    y_train: training labels.
    y_test: held-out labels.

  Returns:
    dict mapping each model name to its micro-averaged F1 score on the
    held-out split.
  """
  # Seed NumPy's global RNG so that estimators created without an explicit
  # random_state give repeatable results from one call to the next.
  np.random.seed(42)

  # Scores collected per model name.
  model_scores = {}

  for name, model in models.items():
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # pos_label only applies to average='binary'; passing it together with
    # average='micro' has no effect other than a UserWarning, so it is omitted.
    model_scores[name] = f1_score(y_test, preds, average='micro')
  return model_scores

In [None]:
# Fit all candidate models and collect their micro-F1 on the held-out split.
scores = fit_and_score(models, x_train, x_test, y_train, y_test)

# Display the name -> score mapping.
scores

In [None]:
## We didn't do hyperparameter tuning because the model is good enough

In [None]:
# The random forest had the best micro-averaged F1 in the comparison above, so it
# is the model we keep. For a single-label target micro-F1 equals accuracy
# (about 97% when this was written).
# A fixed random_state makes the persisted model reproducible on its own,
# independent of whatever state NumPy's global RNG is in when this cell runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train,y_train)
prediction = rf.predict(x_test)

# pos_label only applies to average='binary'; with average='micro' it is ignored
# and merely triggers a UserWarning, so it is not passed.
f1_score(y_test, prediction, average='micro')

# Model Deployment

In [None]:
import pickle
filename = 'rf_model.pkl'

# Write the fitted forest to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open(filename, 'wb') as model_file:
  pickle.dump(rf, model_file)

# some time later...

# load the model from disk
# SECURITY: unpickling can execute arbitrary code - only load files you created
# yourself or otherwise trust.
with open(filename, 'rb') as model_file:
  loaded_model = pickle.load(model_file)

# NOTE(review): rf was trained on features transformed by `scaler`, and the
# scaler is not persisted here. Anything that loads rf_model.pkl elsewhere must
# apply the same fitted scaler to its inputs first - consider saving it too.
# Mean accuracy of the reloaded model on the (already scaled) held-out split.
result = loaded_model.score(x_test, y_test)
print(result)