<div style="text-align: center;">
    <h1>Turo Data Cleaning Notebook</h1>
</div>


## This notebook processes raw data scraped from the Turo website, cleaning and truncating it to prepare a clean dataset ready for analysis.

---


In [30]:
# Import relevant packages
import pandas as pd
import os

---

### The raw data that was scraped from the Turo website is stored in 13 csv files inside a single folder.  I want to create a list containing the file paths for each file.

In [31]:
# Define a variable that contains the folder path
folder_path = r"F:\Data Scraping\Turo\Nashville\Octorber\Raw Excel"

# Innitiate an empty list to store the csv file paths
file_paths = []

# Iterate over all items in the specified folder path.
# If the item is a file, add it to the list 'folder_path'
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        file_paths.append(file_path)

# View the list of file paths
file_paths

['F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week15.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week0.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week2.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week4.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week6.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week7.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week8.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week9.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week10.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week11.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week12.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week13.csv',
 'F:\\Data Scraping\\Turo\\Nashville\\Octorber\\Raw Excel\\week14.csv']

---

### Concatenate all files together into a single DataFrame

In [32]:
# Innitiate an empty list to store the data frames
cars_dfs = []

# Read in each data frame and store it in the list
for df in file_paths: 
    df = pd.read_csv(df)
    cars_dfs.append(df)

# Concatenate the data frames together
concat_df = pd.concat(cars_dfs)

# view the data frame
concat_df.head()

Unnamed: 0,vehicles.id,vehicles.completedTrips,vehicles.estimatedQuote,vehicles.hostId,vehicles.isAllStarHost,vehicles.isFavoritedBySearcher,vehicles.isNewListing,vehicles.make,vehicles.model,vehicles.rating,...,searchLocation.appliedRadius.unit,searchLocation.appliedRadius.value,searchLocation.point.lat,searchLocation.point.lng,searchLocation.topPois[0].name,searchLocation.topPois[0].locationId,searchLocation.topPois[0].shortName,searchLocation.topPois[0].type,searchLocation.topPois[0].point.lat,searchLocation.topPois[0].point.lng
0,2190039,10,,24710785,False,False,False,Volkswagen,Taos,5.0,...,MILES,6.430942,36.180583,-86.804193,Music City Star Riverfront Station,8558547.0,,TRAIN_STATION,36.16214,-86.7738
1,2182884,6,,24710785,False,False,False,Volkswagen,Tiguan,5.0,...,MILES,6.430942,36.180583,-86.804193,Music City Star Riverfront Station,8558547.0,,TRAIN_STATION,36.16214,-86.7738
2,2181949,10,,6333177,False,False,False,Mercedes-Benz,GL-Class,5.0,...,MILES,6.430942,36.180583,-86.804193,Music City Star Riverfront Station,8558547.0,,TRAIN_STATION,36.16214,-86.7738
3,2160753,12,,5849654,False,False,False,Volkswagen,Jetta,5.0,...,MILES,6.430942,36.180583,-86.804193,Music City Star Riverfront Station,8558547.0,,TRAIN_STATION,36.16214,-86.7738
4,2099638,19,,24710785,False,False,False,Chevrolet,Suburban,5.0,...,MILES,6.430942,36.180583,-86.804193,Music City Star Riverfront Station,8558547.0,,TRAIN_STATION,36.16214,-86.7738


---

### Innitial Cleaning

1. Truncate the columns of the data frame.
2. Rename the columns.
3. Remove duplicates

In [33]:
# Sub-set only the columns of interest
concat_df = concat_df[['vehicles.id', 
                      'vehicles.completedTrips',
                      'vehicles.hostId',
                      'vehicles.isAllStarHost',
                      'vehicles.isFavoritedBySearcher', 
                      'vehicles.isNewListing',
                      'vehicles.make',
                      'vehicles.model',
                      'vehicles.rating',
                      'vehicles.type',
                      'vehicles.year',
                      'vehicles.avgDailyPrice.amount',
                      'vehicles.location.city',
                      'vehicles.location.homeLocation.lat',
                      'vehicles.location.homeLocation.lng']]

# Create a dictionary of new column names
new_column_names = {'vehicles.id':'id', 
                      'vehicles.completedTrips':'trips_oct2023',
                      'vehicles.hostId':'host_id_oct2023',
                      'vehicles.isAllStarHost':'all_star_host_oct2023',
                      'vehicles.isFavoritedBySearcher':'favorited_by_searcher_oct2023', 
                      'vehicles.isNewListing':'new_listing_oct2023',
                      'vehicles.make':'make_oct2023',
                      'vehicles.model':'model_oct2023',
                      'vehicles.rating':'rating_oct2023',
                      'vehicles.type':'type_oct2023',
                      'vehicles.year':'year_oct2023',
                      'vehicles.avgDailyPrice.amount':'price_per_day_oct2023',
                      'vehicles.location.city':'city_oct2023',
                      'vehicles.location.homeLocation.lat':'lat_oct2023',
                      'vehicles.location.homeLocation.lng':'lng_oct2023'}

# Apply new column names to df
new_df = concat_df.rename(columns=new_column_names)

# drop duplicates
unique_data = new_df.drop_duplicates()

# Print the count of rows and columns for the data frame
df_rows = unique_data.shape[0]
print(f"There are {df_rows} rows in this data frame")

# Print the number of unique ids in the data frame
unique_vehicles = unique_data[['id']].drop_duplicates().shape[0]
print(f"There are {unique_vehicles} unique vehicles in the dataset")

# print count of duplicates still contained in the dataset
print(f"The dataset contains {df_rows-unique_vehicles} duplicates")

There are 4948 rows in this data frame
There are 1038 unique vehicles in the dataset
The dataset contains 3910 duplicates


---

### After dropping duplicates on the id column, the data frame still has some duplicate values

#### Because of the way the data was scraped, there are some fields that hold different values for the same vehicles:

1. **Price Per Day**
   * Hosts often give renters a discount for booking a vehicle several weeks in advance.  Therefore, the rental price of the vehicle may vary depending on how far in advance it is booked.  This data includes the rental price for a given vehicle ranging from one week in advance to 15 weeks in advance. I want to find the *average* rental price for each vehicle.
3. **Rating**
   * Because the scrapping of this data took about five hours, some renters ended their trip and provided a rating for their trip during the data scraping process.  Therefore, the same vehicle could have multiple values for the ‘Rating’ field. I want to find the *average* rating for each vehicle.
5. **Trips**
   * Because the scrapping of this data took about five hours, some renters ended their trip during the data scraping process.  Therefore, the same vehicle could have multiple values for the ‘Trip’ field. I want to find the *greatest* trip value for each vehicle.


---

Average price per day

In [34]:
# Group by id and return a df with averge price for each id
join1 = unique_data.groupby('id')['price_per_day_oct2023'].agg(average_price='mean').reset_index()

# Remove the price column from the original df and drop the duplicates
join2 = unique_data.drop(columns = ['price_per_day_oct2023']).drop_duplicates()

#Join the average price df to the original df
final_clean = pd.merge(join1, join2, how = 'inner', on = 'id')

# rename the column
final_clean = final_clean.rename(columns = {'average_price' : 'average_price_oct2023'})

# Print the count of rows and columns for the data frame
df_rows = final_clean.shape[0]
print(f"There are now {df_rows} rows in this data frame")

# Print the number of unique ids in the data frame
print(f"There are {unique_vehicles} unique vehicles in the dataset")

# print count of duplicates still contained in the dataset
print(f"The dataset contains {df_rows-unique_vehicles} duplicates")

There are now 1093 rows in this data frame
There are 1038 unique vehicles in the dataset
The dataset contains 55 duplicates


---

Average rating

In [35]:
# Group by id and return a df with averge trip rating for each id
join1 = final_clean.groupby('id')['rating_oct2023'].agg(average_rating='mean').reset_index()

# Remove the rating column from the original df and drop the duplicates
join2 = final_clean.drop(columns = ['rating_oct2023']).drop_duplicates()

# Join the average rating df to the original df
final_clean = pd.merge(join1, join2, how = 'inner', on = 'id')

# Rename the column
final_clean = final_clean.rename(columns = {'average_rating' : 'average_rating_oct2023'})

# Print the count of rows and columns for the data frame
df_rows = final_clean.shape[0]
print(f"There are now {df_rows} rows in this data frame")

# Print the number of unique ids in the data frame
print(f"There are {unique_vehicles} unique vehicles in the dataset")

# print count of duplicates still contained in the dataset
print(f"The dataset contains {df_rows-unique_vehicles} duplicates")

There are now 1082 rows in this data frame
There are 1038 unique vehicles in the dataset
The dataset contains 44 duplicates


---

If one car has two different values for 'trips', then take the larger one

In [36]:
# Group by id and return a df with max trips for each id
join1 = final_clean.groupby('id')['trips_oct2023'].max().reset_index()

# Remove the trips column from the original df and drop the duplicates
join2 = final_clean.drop(columns = ['trips_oct2023']).drop_duplicates()

# Join the trips df to the original df
final_clean = pd.merge(join1, join2, how = 'inner', on = 'id')

# rename the column
final_clean = final_clean.rename(columns = {'trips' : 'trips_oct2023'})

# Print the count of rows and columns for the data frame
df_rows = final_clean.shape[0]
print(f"There are now {df_rows} rows in this data frame")

# Print the number of unique ids in the data frame
print(f"There are {unique_vehicles} unique vehicles in the dataset")

# print count of duplicates still contained in the dataset
print(f"The dataset contains {df_rows-unique_vehicles} duplicates")

There are now 1038 rows in this data frame
There are 1038 unique vehicles in the dataset
The dataset contains 0 duplicates


---

### Cleaning city names

1. Keep only the cities that are close to Nashville
2. Fix redundancies and change improperly names cities

In [37]:
# view a unique list of cities in the dataframe
final_clean['city_oct2023'].unique()

array(['Nashville', 'Brentwood', 'Aurora', 'Murfreesboro',
       'Hendersonville', 'Franklin', 'Smyrna', 'Greenbrier',
       'Spring Hill', 'Gallatin', 'Goodlettsville', 'Lebanon',
       'Hermitage', 'Christiana', 'Antioch', 'Mt. Juliet', 'Lascassas',
       'La Vergne', 'Mount Juliet', 'Columbia', 'Forest Hills',
       "Thompson's Station", 'Nolensville', 'Arrington', 'White House',
       'Beechgrove', 'Clarksville', 'Lavergne', 'Shelbyville', 'Pegram',
       'Mt Juliet', 'Manchester', 'Whites Creek', 'Silver Point',
       'Ashland City'], dtype=object)

In [39]:
#Create a list of cities to keep in the dataset
cities_to_keep = ['Nashville', 
                  'Brentwood', 
                  'Murfreesboro',
                  'Hendersonville', 
                  'Franklin', 
                  'Smyrna', 
                  'Goodlettsville',
                  'Hermitage', 
                  'Antioch', 
                  'Mt. Juliet',
                  'La Vergne', 
                  'Mount Juliet', 
                  'Forest Hills', 
                  'Nolensville', 
                  'Arrington', 
                  'Lavergne',
                  'Mt Juliet', 
                  'Whites Creek']

#Filter for only the cities of interest
final_clean = final_clean[final_clean['city_oct2023'].isin(cities_to_keep)]

# view a unique list of the new city names
print(final_clean['city_oct2023'].unique())

['Nashville' 'Brentwood' 'Murfreesboro' 'Hendersonville' 'Franklin'
 'Smyrna' 'Goodlettsville' 'Hermitage' 'Antioch' 'Mt. Juliet' 'La Vergne'
 'Mount Juliet' 'Forest Hills' 'Nolensville' 'Arrington' 'Lavergne'
 'Mt Juliet' 'Whites Creek']


In [40]:
#Create a dictionary with new names
value_mapping = {'Mt. Juliet':'Mount Juliet',
                 'Mt Juliet':'Mount Juliet',
                 'Forest Hills':'Nashville',
                 'Antioch':'Nashville',
                 'Whites Creek':'Nashville',
                 'Hermitage':'Nashville',
                 'Lavergne':'La Vergne'}

#Apply the new names to the city column
final_clean.loc[final_clean['city_oct2023'].isin(value_mapping.keys()), 'city_oct2023'] = final_clean['city_oct2023'].replace(value_mapping)

# view a unique list of the new city names
final_clean['city_oct2023'].unique()

array(['Nashville', 'Brentwood', 'Murfreesboro', 'Hendersonville',
       'Franklin', 'Smyrna', 'Goodlettsville', 'Mount Juliet',
       'La Vergne', 'Nolensville', 'Arrington'], dtype=object)

---

### The file is now clean and ready for the analysis.

In [38]:
# Export clean data
final_clean.to_excel(r"D:\Data Scraping\Turo\Nashville\Octorber\Clean Data\clean_data.xlsx", index = False)