# Data Encoding

In [24]:
# Packages for loading, cleaning, visualization, and analysis

# Data
import pandas as pd
import numpy as np
import scipy as sp
import os
import string as st




### In Class Exercise  -  Save the Cleaned Data Frame

Copy the cells from your Data Engineering 3 - Data Cleaning notebook to this notebook. Only copy the cells that you use to clean the data (i.e., if you use imputation rather than row deletion copy the imputation cells and not the row deletion cells). 

Run the code in the cells and then save your results, the cleaned data frame, to your local or cloud storage. 

Read the data to validate that you correctly saved your data.

In [25]:
# Function to get the files from a directory

def getallfiles(directory, extension = ".txt"):
    '''Get all files in directory with the specified extension
        and put them into a list.
        The default extension is txt. The directory parameter is the path to 
        the directory containing the files.'''
    filenames = os.listdir(directory)
    myfiles = []
    for e in filenames:
        if e.endswith(extension):
            myfiles.append(os.path.realpath(e))
    return myfiles



In [26]:
def createlist(directory, extension = ".txt"):
    '''Put all files in the specified directory
    with the chosen extension (txt is the default) 
    into a datafame'''
    os.chdir(directory)
    files = getallfiles(directory)
    filelist = []
    for i,file in enumerate(files):
        filelist.append(pd.read_csv(os.path.realpath(file), low_memory = False, encoding = "ISO-8859-1"))
    return(filelist)

In [50]:
# Create the data frame of all accidents

path = "/Users/Pan/Google Drive/Data Science/DS6001"
acts = createlist(path)
accidents_df = pd.concat(acts,ignore_index=True)
accidents_df.shape

(51623, 153)

In [51]:
# Join the narrative and put them in a list

def join_narratives(DF):
    '''With the input of the accident dataframe
    merge the narrative columns into a single narrative
    and return a list of these single narratives for each
    accident report in the dataframe. '''
    narrlist = []
    for i in range(0,15):
        a = str(i+1)
        narrlist.append('NARR'+ a)
    RailNarr = DF.loc[:, narrlist]
    Narratives = []
    for i, _ in enumerate(RailNarr["NARR1"]):
        NarrativeList = RailNarr.iloc[i]
        Anarrative = ""
        for narr in NarrativeList:
            if pd.isnull(narr):
                break
            else:
                Anarrative += str(narr)
        Narratives.append(Anarrative)
    return (Narratives)

In [52]:
narrative_list = join_narratives(accidents_df)
accidents_df["Narrative"] = narrative_list

#Check by looking at narrative 3
accidents_df["Narrative"][3]

'LUS31-03 ATTEMPTING TO DEPART YARD IN OGDEN WITH 164 CARS AND 16,000 TONS ON A 13 DEGREE CURVE.  ENGINEER NOTCHED UP PER THE SEQUENCE AND WENT TO NOTCH 6 AND HELD AT NOTCH 6 FOR 12 SECONDS BUT WAS UNABLE TO MOVE THE TRAIN.  UPON INSPECTION DISCOVERED THAT 10 CARS HAD STRING LINED AND DERAILED.'

In [53]:
# Drop the old narrative columns
narrlist = []
for i in range(0,15):
    a = str(i+1)
    narrlist.append('NARR'+a)
accidents_df.drop(narrlist,axis =1,inplace=True)

In [54]:
# Look at 3 approaches to removing duplicates

#Removing duplicates
accidents_clean_df = accidents_df.drop_duplicates(['YEAR','DAY','MONTH','TIMEHR'])
print(accidents_clean_df.shape)

#Using AMPM
accidents_clean_df = accidents_df.drop_duplicates(['YEAR','DAY','MONTH','TIMEHR','AMPM'])
print(accidents_clean_df.shape)

# Using the FRA FAQ (look at the source of the data)
accidents_clean_df = accidents_df[(accidents_df['JOINTCD']==1)&(accidents_df['TYPE']!=7)]
print(accidents_clean_df.shape)

(30677, 139)
(35453, 139)
(38167, 139)


In [55]:
accidents_clean_df = accidents_clean_df.dropna(axis=1,thresh = (38167-1500))

In [56]:

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.
        
        Columns of dtype floating point are imputed with the mean.

        Columns of other types are imputed with median of the column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') 
                               else X[c].mean() if X[c].dtype == np.dtype('f')
                                else X[c].median() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


In [57]:
accidents_clean_df = DataFrameImputer().fit_transform(accidents_clean_df)


In [58]:
accidents_clean_df.shape

(38167, 88)

In [59]:
#4. Replace values TYPE
accidents_clean_df['TYPE']=accidents_clean_df['TYPE'].replace(range(1,14),
                                    ["Defailment","Headon","Rearend","Side","Raking",
                                    "BrokenTrain","Hwy-Rail","GradeX","Obstruction",
                                    "Explosive","Fire","Other","SeeNarrative"])
accidents_clean_df['TYPE'].value_counts()

Defailment      27429
Other            4321
SeeNarrative     2127
Side             1603
Obstruction       991
Raking            720
Fire              454
Rearend           298
Headon            125
BrokenTrain        80
Explosive          14
GradeX              5
Name: TYPE, dtype: int64

In [60]:
# Map for TYPEQ
# Taken from Rail Equipment Accident/ Incident Report Form

map_typeq = {1:"Freight",2:"PassengerPull",3:"Commuter",4:"Work",5:"Single",
            6:"CutofCars",7:"Yard",8:"Light",9:"Maint",'A':"Maint of Way",
            '1':"Freight",'2':"PassengerPull",'3':'Commuter','4':"Work",'5':"Single",
            '6':"CutofCars",'7':"Yard",'8':"Light",'9':"Maint",'B':"PassengerPush",'C':"CommuterPush",
            'D':"EMU",'E':"DMU"}
accidents_clean_df['TYPEQ'] = accidents_clean_df['TYPEQ'].map(map_typeq)
accidents_clean_df['TYPEQ'].value_counts()

Freight          17988
Yard             12910
CutofCars         1950
Light             1719
PassengerPull     1188
Single            1048
Commuter           363
Maint of Way       311
Maint              254
Work               252
EMU                101
CommuterPush        38
PassengerPush       28
DMU                 17
Name: TYPEQ, dtype: int64

Narrative mess it up,since it has comma.

JSON intro: www.json.org

In [61]:
path = "/Users/Pan/Google Drive/Data Science/DS6001/"
file="TrainAccidents_Clean.csv"
accidents_df=pd.read_csv(path+file,low_memory=False)
accidents_df.shape

FileNotFoundError: File b'/Users/Pan/Google Drive/Data Science/DS6001/TrainAccidents_Clean.csv' does not exist

In [62]:
#now save the Narrative
#first create the dictionary of narratives
#note: JSON will not allow integer values for indices

str_index=[str(x) for x in accidents_clean_df.index]

Narrative_dict=dict(zip(str_index,accidents_clean_df.Narrative))

In [63]:
import json
path = "/Users/Pan/Google Drive/Data Science/DS6001/"
file="TrainAccidents_Clean.txt"
with open(path+file,"w")as destination:
    json.dump(Narrative_dict,destination)

In [65]:
#Read the JSON file
#look to see if it is correct

with open (path+file) as json_file:

    Narrative_dict=json.load(json_file)

str_index = [int(x) for x in Narrative_dict.keys()]
Narrative_dict=dict(zip(str_index,Narrative_dict.values()))
[Narrative_dict.get(key) for key in(0,1)]
    

['Y-BIR3041-06 WHILE PULLING 29 CARS FROM 4101 TRACK DERAILED 2 CARS DUE TO SOFT ROADED. CUT CONTAINEDARTICULATED EQUIPMENT.',
 'UECDAD-01 PULLING EMPTY BOX CARS FROM STORAGE, COUPLED TO FIRST CARS AND PROCEEDED TO MAKE 3 JOINTSSHOVING BACK AND STRETCH ALL JOINTS MADE.  AFTER COUPLING ALL THE CARS, TRAIN STARTED TO MOVE NORTHAND AIR WENT INTO EMERGENCY AND WOULD NOT RECOVER.  INSPECTION REVEALED 6 CARS DERAILED.  INVESTIGATION REVEALED A 65 FOOT TREE HAD FALLEN AGAINST THE WEST SIDE OF THE TRAIN AND AS THE CARS MOVED THETREE TOP FELL IN BETWEEN THE CARS CAUSING THEM TO DERAIL.']

In [66]:
file="TrainAccidents_Clean.csv"
accidents_noNarrative_df = accidents_clean_df.drop("Narrative",axis=1)
accidents_noNarrative_df.to_csv(path+file,index=False)

In [68]:
#load the data to see if it is the same
file="TrainAccidents_Clean.csv"
accidents_df=pd.read_csv(path+file,low_memory=False)
accidents_df.shape

(38167, 87)

In [69]:
accidents_df['Narrative']=Narrative_dict.values()
accidents_df.shape

(38167, 88)