In [None]:
#  Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Load the Uber fares dataset from the project's raw-data directory
# (path is relative to the notebook's working directory).
df = pd.read_csv('./data/raw/uber_fares_dataset/uber.csv')

print("="*60)
print("UBER FARES DATASET - INITIAL DATA EXPLORATION")
print("="*60)

# 1. BASIC DATASET INFORMATION
print("\n1. DATASET STRUCTURE AND DIMENSIONS")
print(f"Dataset Shape: {df.shape}")
print(f"Total Records: {df.shape[0]:,}")
print(f"Total Columns: {df.shape[1]}")

print("\n2. COLUMN NAMES AND DATA TYPES")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

print("\n3. FIRST 10 ROWS")
print(df.head(10))  # head() defaults to 5 rows; 10 are requested explicitly


UBER FARES DATASET - INITIAL DATA EXPLORATION

1. DATASET STRUCTURE AND DIMENSIONS
Dataset Shape: (200000, 9)
Total Records: 200,000
Total Columns: 9

2. COLUMN NAMES AND DATA TYPES
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB
None

3. FIRST 5 ROWS
   Unnamed: 0                            key  fare_amount  \
0    24238194    2015-0

In [10]:
# Section 4: show the final ten records of the frame. The heading and the
# table are emitted by one print call, separated by a newline.
print("\n4. LAST 10 ROWS", df.tail(10), sep="\n")


4. LAST 10 ROWS
        Unnamed: 0                            key  fare_amount  \
199990     9577367    2015-05-24 22:05:56.0000002         12.0   
199991    13512837    2015-06-08 10:49:14.0000001         17.5   
199992    20566507  2010-01-30 16:24:00.000000199          8.9   
199993    28359558    2012-09-29 19:51:27.0000006          9.5   
199994     3189201  2014-01-31 14:42:00.000000181         12.0   
199995    42598914   2012-10-28 10:49:00.00000053          3.0   
199996    16382965    2014-03-14 01:09:00.0000008          7.5   
199997    27804658   2009-06-29 00:42:00.00000078         30.9   
199998    20259894    2015-05-20 14:56:25.0000004         14.5   
199999    11951496   2010-05-15 04:08:00.00000076         14.1   

                pickup_datetime  pickup_longitude  pickup_latitude  \
199990  2015-05-24 22:05:56 UTC        -73.987106        40.741894   
199991  2015-06-08 10:49:14 UTC        -73.981453        40.743919   
199992  2010-01-30 16:24:00 UTC        -74.003

In [25]:
# Quick overview of the frame: structure, summary statistics, samples,
# null counts and cardinality.

# DataFrame.info() prints its own report and returns None, so the label is
# printed first and info() is called on its own line (previously the report
# appeared before the label, followed by "More Information: None").
print("More Information:")
df.info()

# Multi-line reprs are placed on the line after their label (sep="\n") so the
# first header row of each table is not pushed out of alignment by the label.
print("Description:", df.describe(), sep="\n")
print("Head:", df.head(), sep="\n")
print("Tail:", df.tail(), sep="\n")
print("Shape:", df.shape)
print("Columns:", df.columns, sep="\n")
print("Data Types:", df.dtypes, sep="\n")  # printed once; the duplicate call was dropped
print("Missing Values:", df.isnull().sum(), sep="\n")
print("Unique Values:", df.nunique(), sep="\n")

print("\n5. COLUMN NAMES LIST")
column_names = df.columns.tolist()
# Number the columns starting from 1 for readability.
for i, col in enumerate(column_names, 1):
    print(f"{i}. {col}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB
More Information: None
Description:          Unnamed: 0    fare_amount  pickup_longitude  pickup_latitude  \
count  2.000000e+05  200000.000000     200000.000000    200000.000000   
mean   2.771250e+07      11.359955        -72.527638        39.935885   
std    1.601382e+07       9.90177

In [None]:
# 2. DATA QUALITY ASSESSMENT
# Reports per-column missing values, fully duplicated rows, and a count of
# columns per dtype.
print("\n" + "="*60)
print("DATA QUALITY ASSESSMENT")
print("="*60)

print("\n1. MISSING VALUES ANALYSIS")
# Scan for nulls once and derive the percentage from the same counts.
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_summary = pd.DataFrame({
    'Column': missing_data.index,
    'Missing_Count': missing_data.values,
    'Missing_Percentage': missing_percent.values
})
# Keep only columns that actually have gaps, worst offenders first.
missing_summary = missing_summary[missing_summary['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)
if missing_summary.empty:
    # Avoid printing an "Empty DataFrame" repr ahead of the success message.
    print("✅ No missing values found!")
else:
    print(missing_summary)

print("\n2. DUPLICATE RECORDS")
# duplicated() flags rows that repeat an earlier row across all columns.
duplicates = df.duplicated().sum()
print(f"Number of duplicate records: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")
else:
    print("✅ No duplicate records found!")

print("\n3. DATA TYPES SUMMARY")
dtype_summary = df.dtypes.value_counts()
print(dtype_summary)


DATA QUALITY ASSESSMENT

1. MISSING VALUES ANALYSIS
              Column  Missing_Count  Missing_Percentage
6  dropoff_longitude              1              0.0005
7   dropoff_latitude              1              0.0005

2. DUPLICATE RECORDS
Number of duplicate records: 0
✅ No duplicate records found!

3. DATA TYPES SUMMARY
float64    5
int64      2
object     2
Name: count, dtype: int64


In [32]:
# 3. NUMERICAL COLUMNS ANALYSIS
# Prints descriptive statistics for every numeric column, then a detailed
# breakdown (central tendency, spread, IQR-based outlier count) for each
# column whose name suggests a monetary amount.
print("\n" + "="*60)
print("NUMERICAL COLUMNS ANALYSIS")
print("="*60)

# Identify numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")

if numerical_cols:
    print("\n1. DESCRIPTIVE STATISTICS")
    print(df[numerical_cols].describe())

    # A column is treated as a fare column when its lower-cased name contains
    # any of these keywords.
    fare_columns = [
        col for col in numerical_cols
        if any(keyword in col.lower() for keyword in ('fare', 'amount', 'price'))
    ]
    if fare_columns:
        print("\n2. FARE AMOUNT DETAILED ANALYSIS")
        for fare_col in fare_columns:
            fare_values = df[fare_col]

            # Compute the quartiles once and reuse them for both the printed
            # summary and the outlier bounds.
            Q1 = fare_values.quantile(0.25)
            Q3 = fare_values.quantile(0.75)
            IQR = Q3 - Q1

            # Outlier detection using IQR method: anything more than
            # 1.5 * IQR outside the quartiles is flagged.
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            outliers = df[(fare_values < lower_bound) | (fare_values > upper_bound)]

            # mode() returns an empty Series when the column has no non-null
            # values, so guard before indexing into it.
            fare_modes = fare_values.mode()

            print(f"\n--- {fare_col.upper()} ---")
            print(f"Mean: ${fare_values.mean():.2f}")
            print(f"Median: ${fare_values.median():.2f}")
            if fare_modes.empty:
                print("Mode: N/A")
            else:
                print(f"Mode: ${fare_modes.iloc[0]:.2f}")
            print(f"Standard Deviation: ${fare_values.std():.2f}")
            print(f"Minimum: ${fare_values.min():.2f}")
            print(f"Maximum: ${fare_values.max():.2f}")
            print(f"Q1 (25th percentile): ${Q1:.2f}")
            print(f"Q3 (75th percentile): ${Q3:.2f}")
            print(f"IQR: ${IQR:.2f}")

            # Guard the percentage against an empty frame (division by zero).
            outlier_pct = (len(outliers)/len(df))*100 if len(df) else 0.0
            print(f"Potential outliers: {len(outliers)} ({outlier_pct:.2f}%)")



NUMERICAL COLUMNS ANALYSIS

Numerical columns (7): ['Unnamed: 0', 'fare_amount', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

1. DESCRIPTIVE STATISTICS
         Unnamed: 0    fare_amount  pickup_longitude  pickup_latitude  \
count  2.000000e+05  200000.000000     200000.000000    200000.000000   
mean   2.771250e+07      11.359955        -72.527638        39.935885   
std    1.601382e+07       9.901776         11.437787         7.720539   
min    1.000000e+00     -52.000000      -1340.648410       -74.015515   
25%    1.382535e+07       6.000000        -73.992065        40.734796   
50%    2.774550e+07       8.500000        -73.981823        40.752592   
75%    4.155530e+07      12.500000        -73.967154        40.767158   
max    5.542357e+07     499.000000         57.418457      1644.421482   

       dropoff_longitude  dropoff_latitude  passenger_count  
count      199999.000000     199999.000000    200000.000000  
mean      