In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
import pandas as pd                    # Data manipulation library, used to work with tabular data efficiently.
from sklearn.model_selection import train_test_split   # Split data into training and testing sets for model evaluation.
from sklearn.preprocessing import LabelEncoder       # Encode categorical variables into numerical format for modeling.
from sklearn.ensemble import RandomForestClassifier   # Ensemble machine learning model used for classification tasks.
from sklearn.linear_model import LogisticRegression   # Linear model used for binary and multiclass classification.
from sklearn.metrics import accuracy_score           # Metric to evaluate model accuracy.
import joblib## Library to save and load Python objects (e.g., models, transformers) to/from disk.

from sklearn.feature_selection import RFE             # Recursive Feature Elimination for feature selection.
from sklearn.feature_selection import SelectKBest, f_classif   # Univariate feature selection using ANOVA F-value.
import seaborn as sns                   # Data visualization library based on Matplotlib for beautiful plots.
import matplotlib.pyplot as plt        # Data visualization library for creating various types of plots.


### 1. Importing the Dataset

In [53]:
data = pd.read_csv(r"/content/drive/MyDrive/train.csv")
data_test = pd.read_csv(r"/content/drive/MyDrive/test.csv")

In [54]:
data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING


In [55]:
data.tail()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
7347,0.299665,-0.057193,-0.181233,-0.195387,0.039905,0.077078,-0.282301,0.043616,0.06041,0.210795,...,-0.880324,-0.190437,0.829718,0.206972,-0.425619,-0.791883,0.238604,0.049819,30,WALKING_UPSTAIRS
7348,0.273853,-0.007749,-0.147468,-0.235309,0.004816,0.05928,-0.322552,-0.029456,0.080585,0.11744,...,-0.680744,0.064907,0.875679,-0.879033,0.400219,-0.77184,0.252676,0.050053,30,WALKING_UPSTAIRS
7349,0.273387,-0.017011,-0.045022,-0.218218,-0.103822,0.274533,-0.304515,-0.098913,0.332584,0.043999,...,-0.304029,0.052806,-0.266724,0.864404,0.701169,-0.779133,0.249145,0.040811,30,WALKING_UPSTAIRS
7350,0.289654,-0.018843,-0.158281,-0.219139,-0.111412,0.268893,-0.310487,-0.0682,0.319473,0.101702,...,-0.344314,-0.10136,0.70074,0.936674,-0.589479,-0.785181,0.246432,0.025339,30,WALKING_UPSTAIRS
7351,0.351503,-0.012423,-0.203867,-0.26927,-0.087212,0.177404,-0.377404,-0.038678,0.22943,0.269013,...,-0.740738,-0.280088,-0.007739,-0.056088,-0.616956,-0.783267,0.246809,0.036695,30,WALKING_UPSTAIRS


In [56]:
data.shape #7352 ROWS AND 563 COLUMNS

(7352, 563)

In [57]:
print("Number of Rows",data.shape[0])
print("Number of columns",data.shape[1])

Number of Rows 7352
Number of columns 563


In [58]:
data.duplicated().any()

False

In [59]:
duplicated_columns = data.columns[data.T.duplicated()].tolist()

In [60]:
len(duplicated_columns)

21

In [61]:
data = data.drop(duplicated_columns,axis=1)

In [62]:
data.shape

(7352, 542)

In [63]:
data.isnull().sum()

tBodyAcc-mean()-X       0
tBodyAcc-mean()-Y       0
tBodyAcc-mean()-Z       0
tBodyAcc-std()-X        0
tBodyAcc-std()-Y        0
                       ..
angle(X,gravityMean)    0
angle(Y,gravityMean)    0
angle(Z,gravityMean)    0
subject                 0
Activity                0
Length: 542, dtype: int64

### 8. Store Feature Matrix In X and Response(Target) In Vector y

In [64]:
X = data.drop('Activity',axis=1)
y= data['Activity']

In [65]:
y

0               STANDING
1               STANDING
2               STANDING
3               STANDING
4               STANDING
              ...       
7347    WALKING_UPSTAIRS
7348    WALKING_UPSTAIRS
7349    WALKING_UPSTAIRS
7350    WALKING_UPSTAIRS
7351    WALKING_UPSTAIRS
Name: Activity, Length: 7352, dtype: object

In [66]:
le = LabelEncoder()
y = le.fit_transform(y)

In [67]:
y

array([2, 2, 2, ..., 5, 5, 5])

### 9. Splitting The Dataset Into The Training Set And Test Set

In [68]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,
                                               random_state=42)

### 10. Logistic Regression

In [69]:
log  = LogisticRegression()
log.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [70]:
y_pred1 = log.predict(X_test)
accuracy_score(y_test,y_pred1)

0.9809653297076818

### 11. Random Forest Classifier

In [71]:


rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [72]:
y_pred2 = rf.predict(X_test)
accuracy_score(y_test,y_pred2)

0.9802855200543847

### 12. Feature Selection

### 12.1 Filter Method

In [73]:
#The filter method is a feature selection technique that ranks features based on their statistical relevance to the target variable, aiding in selecting informative features for machine learning models in an efficient and model-agnostic manner.

In [74]:
k=200
selector = SelectKBest(f_classif,k=k)
X_train_selected = selector.fit_transform(X_train,y_train)
X_test_selected = selector.transform(X_test)


selected_indices=selector.get_support(indices=True)
selected_features = X_train.columns[selected_indices]
print(len(selected_features))

200


### 12.2 Wrapper Method

In [75]:
#The wrapper method is a feature selection technique that involves using a specific machine learning model to evaluate subsets of features. It repeatedly trains and tests the model with different feature combinations, selecting the best-performing subset. It's computationally more intensive but considers feature interactions, leading to potentially more optimal feature subsets.

In [76]:
estimator = RandomForestClassifier()

In [None]:
k=100
rf_selector = RFE(estimator,n_features_to_select=k)
X_train_selected_rfe = rf_selector.fit_transform(X_train_selected,y_train)
X_test_selected_rfe = rf_selector.transform(X_test_selected)

selected_indices_rfe = rf_selector.get_support(indices=True)
selected_features_rfe = selected_features[selected_indices_rfe]
print(selected_features_rfe)

In [None]:
print(len(selected_features_rfe))

In [None]:
rf = RandomForestClassifier()

In [None]:
rf.fit(X_train_selected_rfe,y_train)

In [None]:
y_pred_rf = rf.predict(X_test_selected_rfe)

In [None]:
accuracy_score(y_test,y_pred_rf)

In [None]:
joblib.dump(rf,"model_rfe")

In [None]:
joblib.dump(selector,"k_best_selector")

In [None]:
joblib.dump(rf_selector,"rf_selector")

In [None]:
data_test=data_test.drop("Activity",axis=1)

In [None]:
duplicated_columns = data_test.columns[data_test.T.duplicated()].to_list()

In [None]:
data_test = data_test.drop(duplicated_columns,axis=1)

In [None]:
model = joblib.load('model_rfe')

In [None]:
selector = joblib.load('k_best_selector')

In [None]:
rf_selector = joblib.load('rf_selector')

In [None]:
selector=selector.transform(data_test)

In [None]:
X_test_selected_rfe = rf_selector.transform(selector)

In [None]:
model.predict(X_test_selected_rfe)