In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read the raw listings export for each city (paths are relative to the notebook).
sf_df = pd.read_csv('../../data/raw/san francisco.csv')
sd_df = pd.read_csv('../../data/raw/san diego.csv')

# Report title banner.
print("=" * 80, "T1.1: INITIAL DATA EXPLORATION", "=" * 80, sep="\n")

# Section banner.
print("\n" + "=" * 80, "1. DATASET OVERVIEW", "=" * 80, sep="\n")

# Row/column counts, one stanza per city.
for city_label, city_df in (("San Francisco", sf_df), ("San Diego", sd_df)):
    n_rows, n_cols = city_df.shape
    print(f"\nüìä {city_label} Dataset:")
    print(f"   - Rows: {n_rows:,}")
    print(f"   - Columns: {n_cols}")

# Size of the table that would result from stacking both cities.
total_rows = len(sf_df) + len(sd_df)
print(f"\nüìä Combined Dataset (if merged):")
print(f"   - Total Rows: {total_rows:,}")

# Compare the column name sets (order is ignored) and list any differences.
sf_cols = set(sf_df.columns)
sd_cols = set(sd_df.columns)
columns_match = sf_cols == sd_cols
print(f"\n‚úì Column names match: {columns_match}")

if not columns_match:
    print(f"   - Columns only in SF: {sf_cols - sd_cols}")
    print(f"   - Columns only in SD: {sd_cols - sf_cols}")

# Section banner.
print("\n" + "=" * 80, "2. COLUMN STRUCTURE", "=" * 80, sep="\n")

# List every San Francisco column with a 1-based, right-aligned position.
print(f"\nTotal Columns: {len(sf_df.columns)}")
print("\nColumn Names:")
numbered_columns = enumerate(sf_df.columns, start=1)
for position, column_name in numbered_columns:
    print(f"{position:2d}. {column_name}")

# Section banner.
print("\n" + "=" * 80, "3. DATA TYPES ANALYSIS", "=" * 80, sep="\n")

# Count how many columns of each dtype every dataset contains.
sf_dtypes = sf_df.dtypes.value_counts()
sd_dtypes = sd_df.dtypes.value_counts()

for city_label, dtype_counts in (("San Francisco", sf_dtypes), ("San Diego", sd_dtypes)):
    print(f"\nüìã {city_label} Data Types:")
    for dtype_name, n_columns in dtype_counts.items():
        print(f"   - {dtype_name}: {n_columns} columns")

def _missing_value_table(frame):
    """Summarise missing values per column of *frame*.

    Returns a tuple ``(counts, percentages, table)``:
    ``counts`` is the per-column null count, ``percentages`` is that count as
    a percentage of the row count rounded to 2 decimals, and ``table`` is a
    DataFrame (Column / Missing_Count / Missing_Percentage) restricted to
    columns with at least one null, sorted by percentage, highest first.
    """
    counts = frame.isnull().sum()
    percentages = (counts / len(frame) * 100).round(2)
    table = pd.DataFrame({
        'Column': counts.index,
        'Missing_Count': counts.values,
        'Missing_Percentage': percentages.values,
    })
    has_missing = table['Missing_Count'] > 0
    table = table[has_missing].sort_values('Missing_Percentage', ascending=False)
    return counts, percentages, table


# --- San Francisco ---------------------------------------------------------
print("\n" + "=" * 80, "4. MISSING VALUES ANALYSIS - SAN FRANCISCO", "=" * 80, sep="\n")

sf_missing, sf_missing_pct, sf_missing_df = _missing_value_table(sf_df)

print(f"\nColumns with Missing Values: {len(sf_missing_df)}/{len(sf_df.columns)}")
print("\nTop 20 Columns with Most Missing Values:")
print(sf_missing_df.head(20).to_string(index=False))

# --- San Diego -------------------------------------------------------------
print("\n" + "=" * 80, "5. MISSING VALUES ANALYSIS - SAN DIEGO", "=" * 80, sep="\n")

sd_missing, sd_missing_pct, sd_missing_df = _missing_value_table(sd_df)

print(f"\nColumns with Missing Values: {len(sd_missing_df)}/{len(sd_df.columns)}")
print("\nTop 20 Columns with Most Missing Values:")
print(sd_missing_df.head(20).to_string(index=False))

# Section banner.
print("\n" + "=" * 80, "6. KEY NUMERICAL FEATURES SUMMARY - SAN FRANCISCO", "=" * 80, sep="\n")

# Columns of interest for the numerical summary.
key_numerical = [
    'price',
    'accommodates',
    'bedrooms',
    'beds',
    'bathrooms',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

# Keep only the columns actually present in the San Francisco data.
existing_numerical = [name for name in key_numerical if name in sf_df.columns]

print("\nüìä San Francisco - Key Numerical Features:")
sf_key_features = sf_df[existing_numerical]
# describe() is transposed so each feature becomes one row of statistics.
sf_summary = sf_key_features.describe().T
# Null counts are aligned to the summary's index (feature names).
sf_summary['missing'] = sf_key_features.isnull().sum()
sf_summary['missing_pct'] = (sf_summary['missing'] / len(sf_df) * 100).round(2)

summary_columns = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
                   'missing', 'missing_pct']
print(sf_summary[summary_columns].to_string())

print("\n" + "="*80)
print("7. KEY NUMERICAL FEATURES SUMMARY - SAN DIEGO")
print("="*80)

print("\nüìä San Diego - Key Numerical Features:")

# `existing_numerical` was filtered against the San Francisco columns only.
# Selecting a list containing a label that San Diego lacks raises KeyError,
# so re-filter against San Diego and report anything that had to be skipped.
# When both datasets share the same columns this is identical to the
# unfiltered list and nothing extra is printed.
sd_existing_numerical = [col for col in existing_numerical if col in sd_df.columns]
sd_skipped_numerical = [col for col in existing_numerical if col not in sd_df.columns]
if sd_skipped_numerical:
    print(f"   - Skipped (not present in San Diego data): {sd_skipped_numerical}")

# One row of descriptive statistics per feature, plus missing-value columns.
sd_summary = sd_df[sd_existing_numerical].describe().T
sd_summary['missing'] = sd_df[sd_existing_numerical].isnull().sum()
sd_summary['missing_pct'] = (sd_summary['missing'] / len(sd_df) * 100).round(2)
print(sd_summary[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'missing', 'missing_pct']].to_string())

def _print_categorical_overview(frame, columns):
    """Print unique/missing statistics for each of *columns* in *frame*.

    For every column this prints the number of distinct non-null values and
    the null count (absolute and as a percentage of rows). Columns with at
    most 10 distinct values also get their value counts printed. A column
    that is absent from *frame* is reported and skipped instead of raising
    KeyError.
    """
    n_rows = len(frame)
    for col in columns:
        print(f"\n{col}:")
        if col not in frame.columns:
            print("   - Not present in this dataset (skipped)")
            continue
        unique_count = frame[col].nunique()
        missing = frame[col].isnull().sum()
        print(f"   - Unique values: {unique_count}")
        print(f"   - Missing: {missing} ({missing/n_rows*100:.2f}%)")
        # Only low-cardinality columns are worth listing value by value.
        if unique_count <= 10:
            print(f"   - Value counts:")
            print(frame[col].value_counts().head(10).to_string())


print("\n" + "="*80)
print("8. KEY CATEGORICAL FEATURES ANALYSIS")
print("="*80)

key_categorical = ['property_type', 'room_type', 'neighbourhood_cleansed',
                   'host_is_superhost', 'instant_bookable']

# Filtered against San Francisco only; the helper guards each dataset
# individually so a column missing from San Diego no longer raises KeyError.
existing_categorical = [col for col in key_categorical if col in sf_df.columns]

print("\nüìä San Francisco - Categorical Features:")
_print_categorical_overview(sf_df, existing_categorical)

print("\nüìä San Diego - Categorical Features:")
_print_categorical_overview(sd_df, existing_categorical)

# Section banner.
print("\n" + "=" * 80, "9. PRICE ANALYSIS (NEEDS CLEANING)", "=" * 80, sep="\n")

# Show the raw state of the price column for each city: dtype, the first
# ten values as a plain list, and the number of nulls.
for city_label, city_df in (("San Francisco", sf_df), ("San Diego", sd_df)):
    print(f"\nüìä {city_label} - Price Column:")
    price_column = city_df['price']
    print(f"   - Data type: {price_column.dtype}")
    print(f"   - Sample values: {price_column.head(10).tolist()}")
    print(f"   - Missing: {price_column.isnull().sum()}")

# NOTE(review): this note is printed unconditionally, whatever dtype was reported above.
print("\n‚ö†Ô∏è  Note: Price column is stored as string with '$' and ',' - needs cleaning in T1.2")

print("\n" + "="*80)
print("10. DATA QUALITY ISSUES IDENTIFIED")
print("="*80)

# Human-readable findings, printed as a numbered list at the end of the section.
issues = []

# Check for fully duplicated rows in either dataset.
sf_dupes = sf_df.duplicated().sum()
sd_dupes = sd_df.duplicated().sum()
if sf_dupes > 0 or sd_dupes > 0:
    issues.append(f"Duplicate rows: SF={sf_dupes}, SD={sd_dupes}")

# Check price format in BOTH datasets. Testing for "not numeric" rather than
# `dtype == 'object'` also catches pandas' dedicated string dtypes.
price_needs_cleaning = any(
    'price' in frame.columns and not pd.api.types.is_numeric_dtype(frame['price'])
    for frame in (sf_df, sd_df)
)
if price_needs_cleaning:
    issues.append("Price column needs cleaning (contains '$' and ',')")

# Check high missing value columns. Compare raw counts against the row count
# instead of the percentage column: that column is rounded to 2 decimals, so
# values just above 50% could round down to exactly 50.00 and be missed.
high_missing_sf = sf_missing_df[sf_missing_df['Missing_Count'] * 2 > len(sf_df)]
high_missing_sd = sd_missing_df[sd_missing_df['Missing_Count'] * 2 > len(sd_df)]
issues.append(f"Columns with >50% missing: SF={len(high_missing_sf)}, SD={len(high_missing_sd)}")

# Check for columns with all values missing. A rounded percentage can read
# 100.00 for a column that still holds a few values in a large dataset, so
# require the missing count to equal the number of rows.
all_missing_sf = sf_missing_df[sf_missing_df['Missing_Count'] == len(sf_df)]
all_missing_sd = sd_missing_df[sd_missing_df['Missing_Count'] == len(sd_df)]
if len(all_missing_sf) > 0 or len(all_missing_sd) > 0:
    issues.append(f"Columns with 100% missing: SF={len(all_missing_sf)}, SD={len(all_missing_sd)}")

print("\nüîç Issues Found:")
for i, issue in enumerate(issues, 1):
    print(f"{i}. {issue}")

print("\n" + "="*80)
print("T1.1 SUMMARY")
print("="*80)

# Finding 1 must reflect the data: only claim a matching structure when the
# two column name sets are really equal, otherwise report the difference.
if set(sf_df.columns) == set(sd_df.columns):
    structure_finding = f"Both datasets have {sf_df.shape[1]} columns with matching structure"
else:
    structure_finding = (
        f"Column structure differs: SF has {sf_df.shape[1]} columns, "
        f"SD has {sd_df.shape[1]} columns"
    )

# NOTE(review): finding 4 ("~30-40%") is a hard-coded figure, not computed
# from the loaded data - confirm it against the missing-value tables.
print(f"""
‚úÖ Task T1.1 Completed: Initial Data Exploration

üìä Dataset Overview:
   - San Francisco: {sf_df.shape[0]:,} rows √ó {sf_df.shape[1]} columns
   - San Diego: {sd_df.shape[0]:,} rows √ó {sd_df.shape[1]} columns
   - Combined: {sf_df.shape[0] + sd_df.shape[0]:,} rows

üîç Key Findings:
   1. {structure_finding}
   2. Price column needs cleaning (stored as string with '$' and ',')
   3. {len(sf_missing_df)} columns in SF and {len(sd_missing_df)} columns in SD have missing values
   4. Review scores have significant missing values (~30-40%)
   5. Text columns (description, host_about) need NLP processing (Member 2's task)
   6. Categorical encoding needed for property_type, room_type, neighbourhood

üìã Next Steps (T1.2 - Data Cleaning Pipeline):
   - Handle missing values
   - Clean price column (remove '$' and ',', convert to float)
   - Remove duplicates
   - Filter invalid entries
   - Document all cleaning decisions
""")

print("\n" + "="*80)

T1.1: INITIAL DATA EXPLORATION

1. DATASET OVERVIEW

📊 San Francisco Dataset:
   - Rows: 7,780
   - Columns: 79

📊 San Diego Dataset:
   - Rows: 13,162
   - Columns: 79

📊 Combined Dataset (if merged):
   - Total Rows: 20,942

✓ Column names match: True

2. COLUMN STRUCTURE

Total Columns: 79

Column Names:
 1. id
 2. listing_url
 3. scrape_id
 4. last_scraped
 5. source
 6. name
 7. description
 8. neighborhood_overview
 9. picture_url
10. host_id
11. host_url
12. host_name
13. host_since
14. host_location
15. host_about
16. host_response_time
17. host_response_rate
18. host_acceptance_rate
19. host_is_superhost
20. host_thumbnail_url
21. host_picture_url
22. host_neighbourhood
23. host_listings_count
24. host_total_listings_count
25. host_verifications
26. host_has_profile_pic
27. host_identity_verified
28. neighbourhood
29. neighbourhood_cleansed
30. neighbourhood_group_cleansed
31. latitude
32. longitude
33. property_type
34. room_type
35. accommodates
36. bathrooms
37. 