In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the air-quality CSV into a DataFrame
data = pd.read_csv('cpcb_dly_aq_tamil_nadu-2014.csv')

# Preprocessing: turn the sampling-date strings into datetimes.
# NOTE(review): no explicit format/dayfirst is given, so pandas infers the
# layout -- confirm the inference matches the file's date convention.
data['Sampling Date'] = pd.to_datetime(data['Sampling Date'])
# Keep a separate frame holding only the rows whose State is 'Tamil Nadu'
tn_data = data.loc[data['State'] == 'Tamil Nadu']

# Summary statistics of the full frame
print(data.describe())
# Count of missing values per column
print(data.isna().sum())
# Distinct values found in the two categorical columns
print(data['State'].unique())
print(data['City/Town/Village/Area'].unique())

In [None]:
# Plot trends in RSPM/PM10 pollution levels over time.
# The x-axis must be the sampling date: data.index is only the row position
# assigned by read_csv, so plotting against it does not show a time trend.
# Sorting by date first keeps the connecting line moving left to right.
rspm_by_date = data.sort_values('Sampling Date')
plt.figure(figsize=(22, 12))
plt.plot(
    rspm_by_date['Sampling Date'],
    rspm_by_date['RSPM/PM10'],
    marker='o',
    linestyle='-',
    color='b',
    label='RSPM/PM10',
)
plt.xlabel('Date')
plt.ylabel('RSPM/PM10 Levels')
plt.title('Trends in RSPM/PM10 Pollution Levels Over Time')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Group data by sampling date and average every numeric column.
# numeric_only=True is required because the frame also holds string columns
# (e.g. 'State'); without it pandas >= 2.0 raises TypeError instead of
# silently skipping them.
daily_mean = data.groupby('Sampling Date').mean(numeric_only=True)

In [None]:
# Daily-average SO2, NO2 and RSPM/PM10 drawn on one shared set of axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_mean.index, daily_mean['SO2'], label='Mean SO2')
ax.plot(daily_mean.index, daily_mean['NO2'], label='Mean NO2')
ax.plot(daily_mean.index, daily_mean['RSPM/PM10'], label='Mean RSPM/PM10')
ax.set_xlabel('Sampling Date')
ax.set_ylabel('Mean Concentration')
ax.set_title('Daily Average Air Quality in Tamil Nadu')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Per-date mean of the SO2 and NO2 columns only.
# Note: this rebinds daily_mean, so the frame no longer has an RSPM/PM10 column.
daily_mean = (
    data[['Sampling Date', 'SO2', 'NO2']]
    .groupby('Sampling Date')
    .mean()
)

In [None]:
# Line chart of the per-date SO2 and NO2 means
plt.figure(figsize=(12, 6))
plt.plot(daily_mean.index, daily_mean['SO2'], label='Mean SO2 Concentration')
plt.plot(daily_mean.index, daily_mean['NO2'], label='Mean NO2 Concentration')
# Axis labels and title are applied to the current axes in a single call
plt.gca().set(
    xlabel='Sampling Date',
    ylabel='Mean Concentration (µg/m³)',
    title='Daily Average SO2 and NO2 Concentrations in Tamil Nadu',
)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) for SO2 and NO2
so2_stats = data['SO2'].describe()
no2_stats = data['NO2'].describe()
# Print the summaries so the cell displays what it computes.
print('SO2 summary statistics:')
print(so2_stats)
print('NO2 summary statistics:')
print(no2_stats)

In [None]:
# Side-by-side box plots: SO2 in the left panel, NO2 in the right panel
plt.figure(figsize=(12, 6))

so2_ax = plt.subplot(1, 2, 1)
data.boxplot(column='SO2', ax=so2_ax)
so2_ax.set_title('SO2 Concentration Box Plot')

no2_ax = plt.subplot(1, 2, 2)
data.boxplot(column='NO2', ax=no2_ax)
no2_ax.set_title('NO2 Concentration Box Plot')

plt.tight_layout()
plt.show()

In [None]:
# Visualize pollutant trends over time for the Tamil Nadu rows using seaborn
plt.figure(figsize=(12, 6))
for pollutant in ['SO2', 'NO2', 'RSPM/PM10']:
    # One labelled line per pollutant, all sharing the same axes
    sns.lineplot(data=tn_data, x='Sampling Date', y=pollutant, label=pollutant)
# The x values are sampling dates, so label the axis to match the other date plots
plt.xlabel('Sampling Date')
plt.ylabel('Concentration (µg/m³)')
plt.title('Air Pollution Trends in Tamil Nadu')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Prepare the regression data for predicting RSPM/PM10: keep the three
# pollutant columns and discard every row with a missing value in any of them.
# Note: tn_data is rebound here and loses its 'Sampling Date' column, so the
# seaborn trend cell cannot be re-run afterwards without reloading the data.
selected_columns = ['SO2', 'NO2', 'RSPM/PM10']
tn_data = tn_data.loc[:, selected_columns].dropna(axis=0, how='any')

In [None]:
# Feature matrix (SO2, NO2) and target vector (RSPM/PM10)
X = tn_data.loc[:, ['SO2', 'NO2']]
y = tn_data.loc[:, 'RSPM/PM10']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the linear model on the training split (fit returns the estimator itself),
# then predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Score the held-out predictions: mean squared error and R-squared
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report both test-set metrics, one per line
print(
    f'Mean Squared Error: {mse}',
    f'R-squared (Coefficient of Determination): {r2}',
    sep='\n',
)

In [None]:
# User Input for Prediction
def _read_concentration(prompt):
    """Prompt repeatedly until a finite, non-negative number is entered.

    Args:
        prompt: Text shown to the user for each attempt.

    Returns:
        The entered concentration as a float.
    """
    import math

    while True:
        raw_value = input(prompt)
        try:
            value = float(raw_value)
        except ValueError:
            # A typo should trigger a re-prompt rather than abort the notebook.
            print(f'Invalid number {raw_value!r}; please enter a numeric value.')
            continue
        # float() also accepts 'nan' and 'inf', and a concentration cannot be negative.
        if not math.isfinite(value) or value < 0:
            print('Concentration must be a finite, non-negative number.')
            continue
        return value


user_so2 = _read_concentration("Enter SO2 concentration (µg/m³): ")
user_no2 = _read_concentration("Enter NO2 concentration (µg/m³): ")

In [None]:
# Wrap the two entered values in a one-row frame whose columns match the
# training features, then ask the fitted model for the RSPM/PM10 estimate
user_input = pd.DataFrame([[user_so2, user_no2]], columns=['SO2', 'NO2'])
predicted_rspm_pm10 = model.predict(user_input.loc[:, ['SO2', 'NO2']])

In [None]:
# Show the single predicted value, formatted to two decimal places
prediction_message = f'Predicted RSPM/PM10 Level: {predicted_rspm_pm10[0]:.2f}'
print(prediction_message)

In [None]:
# Visualization of Actual vs. Predicted RSPM/PM10 against NO2 for the test split.
plt.figure(figsize=(8, 8))
# Observed test-set values
plt.scatter(X_test['NO2'], y_test, color='blue', label='Actual', alpha=0.5)
# Model predictions for the same test rows, so the figure shows the
# actual-vs-predicted comparison that its title promises
plt.scatter(X_test['NO2'], y_pred, color='green', label='Predicted', alpha=0.5)
# The single prediction made from the values the user typed in
plt.scatter(user_input['NO2'], predicted_rspm_pm10, color='red', marker='X', label='User Input')
plt.xlabel('NO2 Concentration')
plt.ylabel('RSPM/PM10 Levels')
plt.title('Actual vs. Predicted RSPM/PM10 Levels')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Visualize pollutant levels by location and time using a heatmap.
# Rows are monitoring stations and columns are calendar months; each cell is
# the mean RSPM/PM10 for that station in that month. Without a `columns` key
# the pivot has no time dimension and yields a single column of station means.
df = pd.read_csv('cpcb_dly_aq_tamil_nadu-2014.csv')
data_heatmap = (
    # NOTE(review): dates are parsed with pandas' default inference, as in the
    # loading cell -- confirm it matches the file's date convention.
    df.assign(Month=pd.to_datetime(df['Sampling Date']).dt.month)
    .pivot_table(
        index='Location of Monitoring Station',
        columns='Month',
        values='RSPM/PM10',
        aggfunc='mean',
    )
)

In [None]:
# Draw the annotated heatmap of the pivoted RSPM/PM10 table
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(data_heatmap, cmap='YlGnBu', annot=True, fmt=".1f")
# Override the axis labels seaborn derives from the table and add a title
heatmap_ax.set_xlabel('Time')
heatmap_ax.set_ylabel('Location')
heatmap_ax.set_title('Air Quality Heatmap')
plt.show()

# Mean SO2, NO2 and RSPM/PM10 for each city/town/village/area
grouped = (
    data
    .groupby('City/Town/Village/Area')[['SO2', 'NO2', 'RSPM/PM10']]
    .mean()
)
print(grouped)