In [1]:
import pandas as pd
import numpy
import os
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# to read the file from the location and process it to save it in a 2D list, with removing excess symbols like (';')
# each row in the list comprises of a separate data instance
def process(path_to_folder):
    train = []
    for root, dirs, files in os.walk(path_to_folder):
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r') as f:
                    text = f.read()
                    temp = text.split(';\n')
                    final = []
                    for i in range (len(temp)):
                        a = temp[i].split(',')
                        final.append(a)
                
                    train = train[:] + final
                    
    return train

In [3]:
trainphoneaccel = process('train/phone/accel')
trainphonegyro = process('train/phone/gyro')
trainwatchaccel = process('train/watch/accel')
trainwatchgyro = process('train/watch/gyro')

In [5]:
trainphoneaccel[:10]

[['1600', 'A', '252207666810782', '-0.36476135', '8.793503', '1.0550842'],
 ['1600', 'A', '252207717164786', '-0.8797302', '9.768784', '1.0169983'],
 ['1600', 'A', '252207767518790', '2.0014954', '11.10907', '2.619156'],
 ['1600', 'A', '252207817872794', '0.45062256', '12.651642', '0.18455505'],
 ['1600', 'A', '252207868226798', '-2.1643524', '13.928436', '-4.4224854'],
 ['1600', 'A', '252207918580802', '-4.332779', '13.361191', '-0.7188721'],
 ['1600', 'A', '252207968934806', '-0.31944275', '13.318359', '-0.23202515'],
 ['1600', 'A', '252208019288809', '1.566452', '9.515274', '-0.01777649'],
 ['1600', 'A', '252208069642813', '-0.32374573', '5.262665', '0.32234192'],
 ['1600', 'A', '252208119996817', '-1.811676', '3.7105103', '1.3739319']]

In [7]:
train = trainphoneaccel + trainphonegyro + trainwatchaccel + trainwatchgyro
len(train)

5575090

In [8]:
def transform(data):
    data = data[:-1]
    data = pd.DataFrame(data, columns = ['Subject-id', 'Activity Label', 'Timestamp', 'x', 'y', 'z'])
    return data

In [9]:
train = transform(train)

In [12]:
print (train.shape)
train.head()

(5575089, 6)


Unnamed: 0,Subject-id,Activity Label,Timestamp,x,y,z
0,1600,A,252207666810782,-0.36476135,8.793503,1.0550842
1,1600,A,252207717164786,-0.8797302,9.768784,1.0169983
2,1600,A,252207767518790,2.0014954,11.10907,2.619156
3,1600,A,252207817872794,0.45062256,12.651642,0.18455505
4,1600,A,252207868226798,-2.1643524,13.928436,-4.4224854


In [25]:
# convert the elements of the dataframe from string to numeric
train.Timestamp = pd.to_numeric(train.Timestamp, errors = 'coerce')


In [14]:
# removing all the null values from the dataframe
train = train.dropna(subset = ['Subject-id','Timestamp', 'Activity Label','x', 'y', 'z'])

In [15]:
label = train['Activity Label'].unique()
label

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'O', 'P', 'Q', 'R', 'S'], dtype=object)

In [18]:
l={}
n=0
for i in label:
    l[i] = n+1
    n+=1
train['Activity Label'] = train['Activity Label'].apply(lambda x: l[x])
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Subject-id,Activity Label,Timestamp,x,y,z
0,1600,1,252207666810782,-0.36476135,8.793503,1.0550842
1,1600,1,252207717164786,-0.8797302,9.768784,1.0169983
2,1600,1,252207767518790,2.0014954,11.10907,2.619156
3,1600,1,252207817872794,0.45062256,12.651642,0.18455505
4,1600,1,252207868226798,-2.1643524,13.928436,-4.4224854


In [26]:
train['Timestamp'] = train['Timestamp'].apply(lambda x: x//1000000)
train['Timestamp'] = train['Timestamp'].apply(lambda x: datetime.fromtimestamp(x))

train.drop(columns="Subject-id",inplace=True)

train.head()

Unnamed: 0,Activity Label,Timestamp,x,y,z
0,1,1977-12-29 07:11:06,-0.36476135,8.793503,1.0550842
1,1,1977-12-29 07:11:57,-0.8797302,9.768784,1.0169983
2,1,1977-12-29 07:12:47,2.0014954,11.10907,2.619156
3,1,1977-12-29 07:13:37,0.45062256,12.651642,0.18455505
4,1,1977-12-29 07:14:28,-2.1643524,13.928436,-4.4224854


In [27]:
for time in ('year','month','week','day','hour','minute','second'):
    train[time] = getattr(train['Timestamp'].dt,time)
train.drop(columns="Timestamp",inplace=True)

train.head()

Unnamed: 0,Activity Label,x,y,z,year,month,week,day,hour,minute,second
0,1,-0.36476135,8.793503,1.0550842,1977,12,52,29,7,11,6
1,1,-0.8797302,9.768784,1.0169983,1977,12,52,29,7,11,57
2,1,2.0014954,11.10907,2.619156,1977,12,52,29,7,12,47
3,1,0.45062256,12.651642,0.18455505,1977,12,52,29,7,13,37
4,1,-2.1643524,13.928436,-4.4224854,1977,12,52,29,7,14,28


In [33]:
train = train.sample(frac=1).reset_index(drop=True)

data=pd.DataFrame(train)

In [29]:
train.head()

Unnamed: 0,Activity Label,x,y,z,year,month,week,day,hour,minute,second
0,15,-0.05848694,0.091308594,-0.019226074,1970,8,34,17,16,47,48
1,17,-1.2444172,-2.751247,-0.4123359,1995,6,22,2,8,55,43
2,3,-5.28006,-8.494171,0.8475189,1970,2,8,19,2,11,23
3,17,0.29934692,0.1472168,0.074905396,1979,4,16,16,11,26,13
4,9,0.483307,0.20546687,0.13411254,1970,4,16,15,14,3,22


In [None]:
dada

In [34]:
y=data['Activity Label']
x=data.drop(columns="Activity Label")
x_train, x_test, y_train, y_test = train_test_split(x,y , train_size = 0.7, random_state =  42)

In [35]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

KeyboardInterrupt: 

In [None]:
train.shape()