In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Install a global "ignore" filter so no Python warnings are shown in cell outputs.
# NOTE(review): this is a blanket filter (no category/module restriction), so it
# presumably also hides library warnings raised while loading the CSV below —
# consider narrowing it to specific categories once the noisy ones are known.
warnings.filterwarnings('ignore')

# Set up plotting style: matplotlib's 'default' style sheet, seaborn's "husl"
# palette as the colour cycle, and the IPython inline backend for figures.
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

In [9]:
# Load the dataset, preferring UTF-8 and falling back to latin-1 when the file
# contains bytes that are not valid UTF-8.
DATA_PATH = 'data.csv'

print("Loading air quality data...")
try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8')
except UnicodeDecodeError:
    # latin-1 assigns a character to every possible byte value, so decoding
    # with it cannot raise UnicodeDecodeError; no further encoding fallback
    # (e.g. cp1252) can ever be reached after this point.
    # Caveat: if the file is really in another single-byte encoding, some
    # non-ASCII characters in the text columns may be decoded incorrectly.
    print("UTF-8 encoding failed, trying latin-1...")
    df = pd.read_csv(DATA_PATH, encoding='latin-1')

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Size: {df.size:,} data points")

Loading air quality data...
UTF-8 encoding failed, trying latin-1...
Dataset loaded successfully!
Shape: (435742, 13)
Size: 5,664,646 data points


In [10]:
# Schema overview: column names, per-column dtypes, then a preview of the
# first rows (rendered as the cell's output).
print("=== DATASET OVERVIEW ===")
print("Columns: " + str(df.columns.tolist()))
print("\nData types:", df.dtypes, sep="\n")
print("\nFirst few rows:")
df.head(5)

=== DATASET OVERVIEW ===
Columns: ['stn_code', 'sampling_date', 'state', 'location', 'agency', 'type', 'so2', 'no2', 'rspm', 'spm', 'location_monitoring_station', 'pm2_5', 'date']

Data types:
stn_code                        object
sampling_date                   object
state                           object
location                        object
agency                          object
type                            object
so2                            float64
no2                            float64
rspm                           float64
spm                            float64
location_monitoring_station     object
pm2_5                          float64
date                            object
dtype: object

First few rows:


Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,150.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",4.8,17.4,,,,,1990-02-01
1,151.0,February - M021990,Andhra Pradesh,Hyderabad,,Industrial Area,3.1,7.0,,,,,1990-02-01
2,152.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.2,28.5,,,,,1990-02-01
3,150.0,March - M031990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.3,14.7,,,,,1990-03-01
4,151.0,March - M031990,Andhra Pradesh,Hyderabad,,Industrial Area,4.7,7.5,,,,,1990-03-01


In [None]:
# Descriptive statistics for the measurement columns listed in
# `numerical_cols`; the resulting table is rendered as the cell's output.
print("=== STATISTICAL SUMMARY ===")
numerical_cols = [
    'so2',
    'no2',
    'rspm',
    'spm',
    'pm2_5',
]
print(f"Numerical columns: {numerical_cols}")
print()
print("Statistical Summary:")
df.loc[:, numerical_cols].describe()