# Trip data files stored in a folder are combined into a single dataframe 

**Important Note-**

-All the trip data csv files are split into two folders because together they are too large to process at once. 

-This splitting is done on the basis of file names. All the csv files with names like 'Divvy_Trips_2015_07.csv' are stored in one folder, and the remaining csv files with names like '202004-divvy-tripdata.csv' are stored in another folder. 

-This is done for convenience of handling the data.

Folder 1 = All csv files having names like '202004-divvy-tripdata.csv'

Folder 2 = All csv files having names like 'Divvy_Trips_2015_07.csv'


In [None]:
import os

import pandas as pd

# Folder that holds the trip files named like '202004-divvy-tripdata.csv'.
trips_data_file= 'insert the folder path containing trip data files of type- 202004-divvy-tripdata.csv '

# Read every '.csv' file in the folder into its own dataframe.
# os.listdir returns names in arbitrary order, so sort them to make the
# row order of the combined dataframe reproducible between runs.
df_trips = [
    pd.read_csv(os.path.join(trips_data_file, name))
    for name in sorted(os.listdir(trips_data_file))
    if name.endswith('.csv')
]

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the folder has no csv files.
if not df_trips:
    raise ValueError('no .csv files found in ' + repr(trips_data_file))

# Report how many files were read (a bare len(...) in the middle of a cell
# is evaluated and then thrown away).
print(len(df_trips))

# Stack all files row-wise into one dataframe and persist it.
# NOTE: the index is written to the csv as well, so it comes back as an
# 'Unnamed: 0' column when the file is read again.
dftrips=pd.concat(df_trips, axis=0)
dftrips.to_csv('store it in desired folder path')
print(dftrips)

# Analyzing Trip data

## Reading '.csv' file and converting into pandas dataframe
    1.Importing dataset and converting into pandas dataframe
    2.Checking the data types

In [None]:
#loading the combined trips csv into a pandas dataframe
filepath='insert your filepath of stored csv file given from the above cell '

#dtypes requested while parsing: two columns as 'category', the time and
#station-name columns as pandas 'string'
trip_dtypes = {
    'rideable_type': 'category',
    'member_casual': 'category',
    'started_at': 'string',
    'ended_at': 'string',
    'start_station_name': 'string',
    'end_station_name': 'string',
}
data1 = pd.read_csv(filepath, dtype=trip_dtypes, low_memory=False)

#showing the dataframe followed by the dtype of every column
display(data1)
display(data1.dtypes)

## Data type manipulation and removal of unwanted columns
       1.The dataset contains columns whose data types need to be converted
       2.Removal of some columns 
   

In [None]:
#changing the data type for better memory allocation
data1=data1.astype({'rideable_type':'category','started_at':'string',
                    'ended_at':'string','start_station_name':'string',
                    'end_station_name':'string','member_casual':'category'})
#removal of any unwanted columns
#('Unnamed: 0' is presumably the index column saved with the combined csv;
# errors='ignore' keeps this cell re-runnable once the column is gone)
data1=data1.drop(columns=['Unnamed: 0'],errors='ignore')
#checking for correct data types
#(a DataFrame exposes 'dtypes'; 'dtype' exists only on a Series and would
# raise AttributeError here)
data1.dtypes

## Variables information and data cleaning
    1.removal of any duplicates and checking the count of na values

In [None]:
#overview of the columns present in the dataset
data1.info()

#keeping only the first occurrence of each fully duplicated row
is_repeat = data1.duplicated()
data1 = data1[~is_repeat]

#number of na values in each column (shown as the cell output)
data1.isnull().sum()

In [27]:
#finding the na values in the dataframe across columns
#(computed once and displayed explicitly: a bare expression that is not the
# last line of a cell produces no output)
na_per_column = data1.isna().sum()
display(na_per_column)
#total na values in dataframe
display(na_per_column.sum())

## Removal of all Na values
    1. Total na values are approximately 3 million records which are removed 
    

In [None]:
#keeping only the rows that have no na value in any column
cleandf = data1.dropna(axis=0, how='any')

#per-column na counts of the cleaned dataframe
remaining_na = cleandf.isnull().sum()
display(remaining_na)

## converting time column's dtype into relevant datetime dtype and then formatting  
    1.converting to 'datetime' dtype allows better handling of time columns into specific time format
    2.It was seen that the cleaned dataframe had time columns with two separate time formats 
    3.So, in order to solve this issue, the cleaned dataframe was divided into 2 subset dataframes, one for each time format, because the time format is inconsistent  
    4.One of the subset with columns-'started_at' and 'ended_at' has different time format which needs to be corrected to '%Y-%m-%d %H:%M:%S'
    5.This can be done by first converting dtype of those time columns into 'datetime64[ns]' and then formatting into '%Y-%m-%d %H:%M:%S'
    6.The second subset contains time columns with the correct time format and only requires conversion of the dtype into 'datetime64[ns]' 
    7.later two subset dataframes can be concatenated into one single dataframe with consistent formatting

In [None]:
#selecting the rows whose 'started_at' and 'ended_at' values both contain '/'
#(.copy() makes the subset independent of cleandf, so the column assignments
# below do not write into a slice and trigger SettingWithCopyWarning)
slash_rows = cleandf['started_at'].str.contains('/') & cleandf['ended_at'].str.contains('/')
newdf1 = cleandf[slash_rows].copy()

#converting the ('started_at'&'ended_at')columns into datetime dtype
#No explicit format is passed: '%Y-%m-%d %H:%M:%S' cannot match strings that
#contain '/', and 'infer_datetime_format' is deprecated since pandas 2.0.
#NOTE(review): ambiguous dates are read month-first by default
#(dayfirst=False) - confirm this matches the source files.
for time_col in ('started_at', 'ended_at'):
    newdf1[time_col] = pd.to_datetime(newdf1[time_col])

#The columns are kept as datetime64 rather than converted back to text:
#a DataFrame has no '.dt' accessor (the old strftime call raised
#AttributeError), datetime64 values already render as '%Y-%m-%d %H:%M:%S',
#and the trip duration arithmetic later on needs real datetime values.

#displaying the subset dataframe with correct datetime format
display(newdf1)

In [None]:
#selecting the rows whose 'started_at' and 'ended_at' values both contain '-'
#(.copy() makes the subset independent of cleandf, so the column assignments
# below do not write into a slice and trigger SettingWithCopyWarning)
dash_rows = cleandf['started_at'].str.contains('-') & cleandf['ended_at'].str.contains('-')
newdf2 = cleandf[dash_rows].copy()

#converting the ('started_at'&'ended_at') columns into datetime dtype
#The explicit format is kept; 'infer_datetime_format' is dropped because it
#is deprecated since pandas 2.0 and redundant when a format is given.
for time_col in ('started_at', 'ended_at'):
    newdf2[time_col] = pd.to_datetime(newdf2[time_col], format='%Y-%m-%d %H:%M:%S')

#displaying the subset dataframe with correct datetime format
display(newdf2)

#combining two dataframes with same time formats 
newdf=pd.concat([newdf1,newdf2])

## Adding new columns to calculate trip duration and trip distance
    1.'trip_duration' column is added by subtracting relevant time columns and showing duration in hours
    2.'trip_distance' column is added by calculating the distance between two geographical coordinates using the haversine distance formula from the 'mpu' library; units are in kms

In [101]:
#trip duration in hours: elapsed seconds between start and end, divided by 3600
elapsed = newdf['ended_at'] - newdf['started_at']
newdf['trip_duration'] = elapsed.dt.total_seconds() / 3600

#ordering the trips chronologically and renumbering the rows from 0
new1 = newdf.sort_values(by=['started_at', 'ended_at']).reset_index(drop=True)

#haversine distance between the (lat, lng) start point and end point of each trip
#NOTE(review): 'mpu' is not imported in the cells shown here - it must
#already be imported in the session before this cell runs
start_points = zip(new1['start_lat'], new1['start_lng'])
end_points = zip(new1['end_lat'], new1['end_lng'])
new1['trip_distance'] = [
    mpu.haversine_distance(origin, destination)
    for origin, destination in zip(start_points, end_points)
]

#keeping two decimal places for the distance
new1['trip_distance'] = new1['trip_distance'].round(decimals=2)

## Completing data cleaning,manipulation,pre-processing steps to store data into new file 

In [107]:
#storing the processed trips locally; the file contains the trip data from 2020-2023
output_csv = 'filepath of csv file to store in local with names like 2020-2023.csv '
new1.to_csv(output_csv)