In [1]:
import os
import warnings  # For warning control; note that warnings.filterwarnings("ignore") is only mentioned here and is never actually called
from glob import glob  # glob expands a wildcard pattern into a list of all matching file names

import numpy as np  # For mathematical calculations
import pandas as pd  # For Pandas DataFrame
from scipy.stats import kurtosis, skew  # To calculate skewness, kurtosis

In [None]:
def main(train_label_df, feature):
    """Build the labelled feature table of every subject found in the label index.

    Args:
        train_label_df: Label index; its ``Datafile``, ``Subject`` and ``Label``
            columns are read here.
        feature: Name of the summary statistic, forwarded to ``fetch_train_data``.

    Prints a completion message when every subject has been processed and
    returns nothing. Writing the per-subject CSV is currently disabled.
    """
    # The first path component of each Datafile entry is the subject folder.
    subject_folders = {datafile.split('/')[0] for datafile in train_label_df.Datafile}

    for folder in subject_folders:
        # One row of extracted features per activity file of this subject.
        subject_features = fetch_train_data(folder, feature)
        train_data_df = pd.DataFrame(subject_features)

        # Labels are attached positionally, so they must line up with the rows above.
        belongs_to_subject = train_label_df.Subject == folder
        subject_labels = train_label_df.loc[belongs_to_subject]['Label'].to_list()
        train_data_df['activity'] = subject_labels

        # Peek at the first row (the result is discarded); the export below is disabled.
        train_data_df.head(1)
        # train_data_df.to_csv("dataset/processed_train/"+ folder + ".csv", index=False)

    print("Excuted Successfully")

# Here is the function to fetch the summarised training data of each subject
def fetch_train_data(folder, feature):
    """Summarise every activity file of one subject into a single feature table.

    Args:
        folder: Name of the subject folder under ``./../dataset/train/``.
        feature: Name of the summary statistic, forwarded to ``extract_features``.

    Returns:
        A DataFrame with one row per CSV file found in the folder, or an empty
        DataFrame when no file matches. Rows follow the order in which ``glob``
        returns the files.
    """
    # All activity files of the given subject.
    # NOTE(review): main() attaches labels to these rows by position, so the
    # glob order has to match the order of the label file - confirm.
    file_names = glob("./../dataset/train/" + folder + "/*.csv")

    # Collect the per-file summaries and assemble the table once:
    # DataFrame.append was removed in pandas 2.0, and growing a frame
    # row by row copies it on every iteration.
    rows = []
    for file_name in file_names:
        df = pd.read_csv(file_name, header=None)
        stats = extract_features(df, feature)
        if not isinstance(stats, pd.DataFrame):
            # A per-column summary (Series / 1-D array) becomes one row.
            stats = pd.Series(stats).to_frame().T
        rows.append(stats)

    if not rows:
        return pd.DataFrame()

    return pd.concat(rows, ignore_index=True)
    
def extract_features(df, feature):
    """Compute the requested summary statistic for one activity recording.

    The work is delegated to ``time_stats``; its result is returned transposed.
    """
    summary = time_stats(df, feature)
    return summary.transpose()

def time_stats(switcher_df, switcher_feature):
    """Return one time-domain summary statistic, computed per column.

    Only the requested statistic is evaluated.

    Args:
        switcher_df: Numeric DataFrame; every column is summarised over its rows.
        switcher_feature: One of 'mean', 'median', 'min', 'max', 'std',
            'variance', 'mad', 'rms', 'zcr', 'iqr', 'pe', 'kurtosis', 'skew'.

    Returns:
        A pandas Series indexed by the columns of ``switcher_df`` for a known
        feature name, or the string "Invalid feature" for an unknown one
        (kept for backward compatibility).
    """
    def _per_column(values):
        # Wrap a bare array so every statistic comes back in the same shape.
        return pd.Series(np.asarray(values), index=switcher_df.columns)

    def _mad():
        # Mean absolute deviation around the mean; DataFrame.mad() was
        # removed in pandas 2.0, so it is spelled out here.
        return (switcher_df - switcher_df.mean()).abs().mean()

    def _rms():
        # Explicit axis=0: np.sum(DataFrame) relies on a deprecated implicit axis.
        return np.sqrt((switcher_df ** 2).sum(axis=0) / len(switcher_df))

    def _zcr():
        # Count sign changes between consecutive rows of each column.
        # np.diff defaults to the last axis, which would compare neighbouring
        # columns instead, so axis=0 is required.
        negative = np.signbit(switcher_df.to_numpy())
        return _per_column(np.diff(negative, axis=0).sum(axis=0))

    switcher = {
        'mean': switcher_df.mean,
        'median': switcher_df.median,
        'min': switcher_df.min,
        'max': switcher_df.max,
        'std': switcher_df.std,
        'variance': switcher_df.var,
        'mad': _mad,
        'rms': _rms,
        'zcr': _zcr,
        'iqr': lambda: switcher_df.quantile(0.75) - switcher_df.quantile(0.25),
        'pe': lambda: switcher_df.quantile(0.75),
        'kurtosis': lambda: _per_column(kurtosis(switcher_df.to_numpy())),
        'skew': lambda: _per_column(skew(switcher_df.to_numpy())),
    }

    compute = switcher.get(switcher_feature)
    if compute is None:
        return "Invalid feature"
    return compute()

    
if __name__ == '__main__':
    # Statistic to extract; choose one of
    # {'mean','median','min','max','std','variance','mad','rms','zcr','iqr','pe','kurtosis','skew'}
    feature = 'mean'

    # Labelled index of the training recordings
    train_label_df = pd.read_csv("./../dataset/train.csv")

    main(train_label_df, feature)

In [2]:
import pandas as pd
from glob import glob

# Stack every processed per-subject feature file into one training table.
files = glob("./../dataset/processed_train/*.csv")

# Read everything first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop copies the frame on every iteration.
frames = [pd.read_csv(file) for file in files]
train_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Split into the feature matrix and the target column.
x = train_df.drop(['activity'],axis=1)
y = train_df['activity']

(6401, 20)

In [4]:
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Stack every processed per-subject feature file into one table.
    # (DataFrame.append was removed in pandas 2.0, hence a single concat.)
    files = glob("./../dataset/processed_train/*.csv")
    frames = [pd.read_csv(file) for file in files]
    train_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # -1 marks rows that have not been assigned to a fold yet.
    train_df["kfold"] = -1

    # Shuffle the rows with a fixed seed so the fold assignment is reproducible.
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # The rows are already shuffled above, so the splitter itself does not
    # shuffle. random_state is omitted because current scikit-learn raises a
    # ValueError when it is combined with shuffle=False.
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

    # Tag every row with the fold in which it serves as validation data.
    for fold, (train_idx, val_idx) in enumerate(kf.split(X=train_df, y=train_df.activity.values)):
        print(len(train_idx), len(val_idx))
        train_df.loc[val_idx, 'kfold'] = fold

    train_df.to_csv("./../dataset/train_folds.csv", index=False)

5116 1285
5118 1283
5120 1281
5122 1279
5128 1273


In [7]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import make_scorer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# For each validation fold, the folds that form the training split:
# every fold index in 0..4 except the key itself.
FOLD_MAPPPING = {
    held_out: [fold for fold in range(5) if fold != held_out]
    for held_out in range(5)
}

# Fold that is held out for validation in this run.
FOLD = 0

if __name__ == "__main__":
    df = pd.read_csv("./../dataset/train_folds.csv")

    # Rows of the mapped folds train the models; rows of FOLD validate them.
    train_df = df[df.kfold.isin(FOLD_MAPPPING.get(FOLD))].reset_index(drop=True)
    valid_df = df[df.kfold==FOLD].reset_index(drop=True)

    y_train = train_df.activity.values
    y_valid = valid_df.activity.values

    train_df = train_df.drop(["activity", "kfold"], axis=1)
    valid_df = valid_df.drop(["activity", "kfold"], axis=1)

    # Keep the validation columns in the same order as the training columns.
    valid_df = valid_df[train_df.columns]

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    # Reuse the mapping learned on the training labels. Refitting on the
    # validation labels could assign different integers to the same class
    # whenever the two splits do not contain exactly the same set of classes.
    y_valid = le.transform(y_valid)

    # Not used by the code below; kept because it was defined here originally.
    model_score = make_scorer(metrics.roc_auc_score, greater_is_better=True)

    pipeline_lr=Pipeline([('scaler1',StandardScaler()), ('lr_classifier',LogisticRegression(random_state=42))])

    pipeline_randomforest=Pipeline([('scaler2',StandardScaler()), ('rf_classifier',RandomForestClassifier())])

    model_pipeline = [pipeline_lr, pipeline_randomforest]

    # Display name of each pipeline, keyed by its position in model_pipeline
    pipe_dict = {0: 'Logistic Regression', 1: 'RandomForest'}

    # Fit every pipeline on the training split
    for pipe in model_pipeline:
        pipe.fit(train_df, y_train)

    # Report the accuracy of every pipeline on the held-out fold
    for i, model in enumerate(model_pipeline):
        print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(valid_df, y_valid)))



Logistic Regression Test Accuracy: 0.48249027237354086
RandomForest Test Accuracy: 0.9245136186770428


In [18]:
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

if __name__ == "__main__":
    df = pd.read_csv("dataset/final_train.csv")

    # Feature matrix and integer-encoded target
    x_train = df.drop(["activity"], axis=1)

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(df.activity.values)

    # Both pipelines name the classifier step 'clf' so the parameter grids
    # below can address it uniformly as 'clf__<param>'.
    pipeline_lr=Pipeline([('scaler1',StandardScaler()), ('clf',LogisticRegression(random_state=42))])

    pipeline_randomforest=Pipeline([('scaler2',StandardScaler()), ('clf',RandomForestClassifier())])

    model_pipeline = [pipeline_lr, pipeline_randomforest]

    # Display name of each pipeline, keyed by its position in model_pipeline
    pipe_dict = {0: 'Logistic Regression', 1: 'RandomForest'}

    # One parameter grid per pipeline, in the same order as model_pipeline
    parameters = [{
                     'clf__penalty': ['l2'],
                     'clf__C': np.logspace(0, 4, 10)
                    },
                    {
                     'clf__n_estimators': [10, 30],
                     'clf__max_features': [0.25, 1.0]
                    }
                   # {'estimator':[Any_other_estimator_you_want],
                   #  'estimator__valid_param_of_your_estimator':[valid_values]}

                  ]

    # random_state is omitted: it has no effect without shuffling, and current
    # scikit-learn raises a ValueError when it is combined with shuffle=False.
    # The splitter is built once because it is identical for every search.
    cv = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

    # Run one grid search per pipeline; the position ties together the
    # pipeline, its display name and its parameter grid.
    for pipe, model in enumerate(model_pipeline):
        grid_search = GridSearchCV(estimator=model, param_grid=parameters[pipe], cv=cv)
        grid_search.fit(x_train, y_train)
        print("{}: model best parameters are --> {}".format(pipe_dict[pipe], grid_search.best_params_))
        print("{}: cv accuracy is  {}".format(pipe_dict[pipe], grid_search.best_score_))








{'clf__C': 21.544346900318832, 'clf__penalty': 'l2'}
0.48064894448788115
{'clf__max_features': 0.25, 'clf__n_estimators': 30}
0.9366692728694292


In [27]:
df = pd.read_csv("./../dataset/final_train.csv")

# Standardise every feature column. Re-wrapping the scaled array in a
# DataFrame gives it default integer column labels.
scaler = preprocessing.StandardScaler()
feature_columns = df.drop(columns=["activity"])
x_train = pd.DataFrame(scaler.fit_transform(feature_columns))

print(x_train.head())

# Encode the activity labels as integers.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(df.activity.values)

print(y_train)

          0         1         2         3         4         5         6  \
0  0.178095 -2.015409  4.183882 -0.133538  0.065940 -0.053481  0.319949   
1 -0.038229  0.559349 -0.455718 -0.138518  0.090020  0.420946  0.142741   
2 -0.045715 -0.154543  0.034852 -0.143862  0.076930 -0.044380 -1.350806   
3  0.066110 -1.742841  0.262129 -0.134323  0.079034  0.378892 -0.177211   
4  0.056878 -0.187423 -0.269038 -0.027346  0.054013 -0.283227  0.233079   

          7         8         9        10        11        12        13  \
0  0.769120  0.364714  0.415151  0.625295 -0.022404  1.158104 -0.386686   
1  0.417690 -0.509728  1.473782  0.565350 -0.067406  1.361573 -0.340664   
2 -1.426101 -0.412063 -0.148613  0.276372  0.597562 -1.907761 -0.013977   
3  1.533104 -0.019868  0.268200  0.477846  0.685203  0.067808  0.462774   
4 -1.346383  0.133122 -0.074156 -0.168438 -2.014968 -0.188881 -0.231301   

         14        15        16        17        18  
0 -0.330629 -0.143170 -0.153546 -0.215150 -0

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

if __name__ == "__main__":
    df = pd.read_csv("dataset/final_train.csv")

    # Integer-encoded target
    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(df.activity.values)

    # Raw features; scaling happens inside each pipeline so the scaler is
    # fitted on the training part of every CV split only, instead of on the
    # whole data set before cross-validation.
    x_train = df.drop(["activity"], axis=1)

    # GridSearchCV needs real estimators, not ('clf', estimator) tuples, so
    # each classifier is wrapped in a Pipeline whose final step is named 'clf'.
    classifiers = [LogisticRegression(random_state=42), RandomForestClassifier(), xgb.XGBClassifier()]
    model_pipeline = [
        Pipeline([('scaler', StandardScaler()), ('clf', classifier)])
        for classifier in classifiers
    ]

    # Display name of each pipeline, keyed by its position in model_pipeline
    pipe_dict = {0: 'Logistic Regression', 1: 'RandomForest', 2: 'XGBoost'}

    # One parameter grid per pipeline, in the same order as model_pipeline.
    # Every key carries the 'clf__' prefix so it reaches the classifier step.
    parameters = [{
                     'clf__penalty': ['l2'],
                     'clf__C': np.logspace(0, 4, 10)
                    },
                    {
                     'clf__n_estimators': [10, 30],
                     'clf__max_features': [0.25, 1.0]
                    },
                    # NOTE(review): 'binary:logistic' is a binary objective and
                    # 'silent' may not be accepted by recent xgboost releases -
                    # confirm both against the installed version and the number
                    # of activity classes.
                    {'clf__nthread': [4], #when use hyperthread, xgboost may become slower
                    'clf__objective': ['binary:logistic'],
                    'clf__learning_rate': [0.05], #so called `eta` value
                    'clf__max_depth': [6],
                    'clf__min_child_weight': [11],
                    'clf__silent': [1],
                    'clf__subsample': [0.8],
                    'clf__colsample_bytree': [0.7],
                    'clf__n_estimators': [5], #number of trees, change it to 1000 for better results
                    'clf__seed': [1337]}
                   # {'estimator':[Any_other_estimator_you_want],
                   #  'estimator__valid_param_of_your_estimator':[valid_values]}

                  ]

    # random_state is omitted: it has no effect without shuffling, and current
    # scikit-learn raises a ValueError when it is combined with shuffle=False.
    cv = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

    # Run one grid search per pipeline; the position ties together the
    # pipeline, its display name and its parameter grid.
    for pipe, model in enumerate(model_pipeline):
        grid_search = GridSearchCV(estimator=model, param_grid=parameters[pipe], cv=cv)
        grid_search.fit(x_train, y_train)

        print("{}: model best parameters are --> {}".format(pipe_dict[pipe], grid_search.best_params_))

        print("{}: cv accuracy is  {}".format(pipe_dict[pipe], grid_search.best_score_))


In [8]:
import os
# Scratch check: show the entries at the root of the C: drive.
# The path is hard-coded and only exists on Windows.
dirname = "C:/"
os.listdir(path=dirname)

['$Recycle.Bin',
 '$SysReset',
 'adb',
 'Android',
 'Apps',
 'DELL',
 'dell.sdr',
 'dep',
 'Documents and Settings',
 'Drivers',
 'FIOD.manifest',
 'hiberfil.sys',
 'Intel',
 'Java',
 'MahaSecure',
 'OneDriveTemp',
 'PerfLogs',
 'Program Files',
 'Program Files (x86)',
 'ProgramData',
 'Recovery',
 'swapfile.sys',
 'System Volume Information',
 'Users',
 'Windows']