In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("AB_NYC_2019.csv")

# Display basic info before cleaning
print("Initial Data Info:")
print(df.info())

# Drop duplicates
df.drop_duplicates(inplace=True)

# Handle missing values
df.drop(columns=['name', 'host_name'], inplace=True)  # Dropping non-essential columns with missing values
df['last_review'] = pd.to_datetime(df['last_review'])  # Convert last_review to datetime
df['reviews_per_month'].fillna(0, inplace=True)  # Fill NaN in reviews_per_month with 0

# Remove unrealistic values
df = df[(df['price'] > 0) & (df['price'] < 10000)]  # Remove outliers in price
df = df[df['minimum_nights'] <= 365]  # Remove extreme booking lengths

# Reset index
df.reset_index(drop=True, inplace=True)

# Display cleaned data info
print("Cleaned Data Info:")
print(df.info())

# Save cleaned data
df.to_csv("AB_NYC_2019_cleaned.csv", index=False)
print("Data cleaning completed and saved as AB_NYC_2019_cleaned.csv")


Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reviews_per_month'].fillna(0, inplace=True)  # Fill NaN in reviews_per_month with 0


Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48867 entries, 0 to 48866
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              48867 non-null  int64         
 1   host_id                         48867 non-null  int64         
 2   neighbourhood_group             48867 non-null  object        
 3   neighbourhood                   48867 non-null  object        
 4   latitude                        48867 non-null  float64       
 5   longitude                       48867 non-null  float64       
 6   room_type                       48867 non-null  object        
 7   price                           48867 non-null  int64         
 8   minimum_nights                  48867 non-null  int64         
 9   number_of_reviews               48867 non-null  int64         
 10  last_review                     38825 non-null  dat