In [None]:
import pandas as pd

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a path relative to the notebook.
file_path = r"C:\Users\Mahes\Downloads\Air_dataset.csv\Air_dataset.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
df.head()

In [None]:
import matplotlib.pyplot as plt

# Mean PM2.5 per city; the city with the largest mean is the one plotted
mean_pm25_by_city = df.groupby('City')['PM2.5'].mean()
highest_pm25_city = mean_pm25_by_city.idxmax()

# Rows belonging to that city only
city_data = df.loc[df['City'] == highest_pm25_city]

# PM2.5 (x) against NO2 (y) for the selected city
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(city_data['PM2.5'], city_data['NO2'], alpha=0.6)
ax.set_title(f'Scatter Plot of PM2.5 vs NO2 for {highest_pm25_city}', fontsize=14)
ax.set_xlabel('PM2.5', fontsize=12)
ax.set_ylabel('NO2', fontsize=12)
ax.grid(True)
plt.show()


In [None]:
# Mean PM2.5 per city; the city with the smallest mean is the one plotted
mean_pm25_per_city = df.groupby('City')['PM2.5'].mean()
lowest_pm25_city = mean_pm25_per_city.idxmin()

# Rows belonging to that city only
city_data_lowest = df.loc[df['City'] == lowest_pm25_city]

# PM2.5 (x) against NO2 (y) for the selected city, drawn in green
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    city_data_lowest['PM2.5'],
    city_data_lowest['NO2'],
    alpha=0.6,
    color='green',
)
ax.set_title(f'Scatter Plot of PM2.5 vs NO2 for {lowest_pm25_city}', fontsize=14)
ax.set_xlabel('PM2.5', fontsize=12)
ax.set_ylabel('NO2', fontsize=12)
ax.grid(True)
plt.show()


In [None]:
import plotly.express as px

# Correlation between PM2.5 and NO2 over the whole dataset
correlation_pm25_no2 = df['PM2.5'].corr(df['NO2'])

# Scatter of PM2.5 vs NO2, coloured by PM2.5, with the correlation in the title
fig = px.scatter(
    df,
    x="PM2.5",
    y="NO2",
    color="PM2.5",
    hover_data=["City"],
    labels={"PM2.5": "PM2.5 Levels", "NO2": "NO2 Levels"},
    title=f"Scatter Plot of PM2.5 vs NO2 (Correlation: {correlation_pm25_no2:.2f})",
    width=800,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='PM2.5 Levels',
    xaxis_showgrid=True,
    yaxis_title='NO2 Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import plotly.express as px

# One SO2 box per city, each city in its own colour
fig = px.box(
    df,
    x="City",
    y="SO2",
    color="City",
    title="Box Plot of SO2 Levels Across Different Cities",
    labels={"SO2": "SO2 Levels", "City": "City"},
    width=900,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='City',
    xaxis_showgrid=True,
    yaxis_title='SO2 Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Long format: one row per (City, Pollutant) reading, value stored in "Level"
df_melted = pd.melt(
    df,
    id_vars="City",
    value_vars=["PM2.5", "NO2", "SO2"],
    var_name="Pollutant",
    value_name="Level",
)

# One box per pollutant
fig = px.box(
    df_melted,
    x="Pollutant",
    y="Level",
    color="Pollutant",
    title="Combined Box Plot for PM2.5, NO2, and SO2 vs Pollutant levels",
    labels={"Pollutant": "Pollutant Type", "Level": "Pollutant Levels"},
    width=900,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='Pollutant Type',
    xaxis_showgrid=True,
    yaxis_title='Pollutant Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Long format: one row per (City, Pollutant) reading, value stored in "Concentration"
df_melted = pd.melt(
    df,
    id_vars=["City"],
    value_vars=["SO2", "PM2.5", "NO2", "CO", "O3", "NH3"],
    var_name="Pollutant",
    value_name="Concentration",
)

# Boxes grouped by city on the x-axis and coloured by pollutant
fig = px.box(
    df_melted,
    x="City",
    y="Concentration",
    color="Pollutant",
    title="Combined Box Plot of Various Pollutants Across Different Cities",
    labels={"Concentration": "Concentration Levels", "City": "City", "Pollutant": "Pollutant"},
    width=1000,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='City',
    xaxis_showgrid=True,
    yaxis_title='Concentration Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import math

# Pollutant columns; each becomes one box inside every city's subplot
gases = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene", "Xylene"]

# Unique cities in the dataset (in order of first appearance)
cities = df['City'].unique()

# Grid size: fixed number of columns, as many rows as needed to hold every city
n_cities = len(cities)
n_cols = 5
n_rows = math.ceil(n_cities / n_cols)

# subplot_titles is passed as a plain list of strings rather than the NumPy
# array returned by unique(), so plotly never has to truth-test an array.
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[str(city) for city in cities],
    vertical_spacing=0.05,
    horizontal_spacing=0.05,
)

# Fill the grid left-to-right, top-to-bottom: one subplot per city
for position, city in enumerate(cities):
    row = position // n_cols + 1
    col = position % n_cols + 1
    city_data = df[df['City'] == city]

    for gas in gases:
        fig.add_trace(
            go.Box(y=city_data[gas], name=gas, boxmean=False),
            row=row,
            col=col
        )

# Update layout for the entire figure
fig.update_layout(
    title="Box Plots for Gases Across Cities",
    height=1500,
    width=1200,
    showlegend=False
)

# Show the interactive plot
fig.show()


In [None]:
import plotly.express as px

# Columns to stack into long format
gas_columns = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene", "Xylene"]

# Long format: one row per (City, Gas) reading, value stored in "Concentration"
df_melted_all_gases = df.melt(
    id_vars=["City"],
    value_vars=gas_columns,
    var_name="Gas",
    value_name="Concentration",
)

# One box per gas
fig = px.box(
    df_melted_all_gases,
    x="Gas",
    y="Concentration",
    color="Gas",
    title="Box Plot of Concentration Levels for All Gases",
    labels={"Concentration": "Concentration Levels", "Gas": "Gas"},
    width=1000,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='Gas',
    xaxis_showgrid=True,
    yaxis_title='Concentration Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Parse 'Date' and store its calendar year in a new 'Year' column on df
parsed_dates = pd.to_datetime(df['Date'])
df['Year'] = parsed_dates.dt.year

# Bars of average PM2.5 per year (histfunc="avg" averages y within each x bin)
fig = px.histogram(
    df,
    x="Year",
    y="PM2.5",
    color="Year",
    histfunc="avg",
    title="Histogram of Average PM2.5 Levels by Year",
    labels={"Year": "Year", "PM2.5": "Average PM2.5 Levels"},
    width=1000,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='Year',
    xaxis_showgrid=True,
    yaxis_title='Average PM2.5 Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import plotly.express as px

# Bars of average PM2.5 per city (histfunc="avg" averages y within each x bin)
fig = px.histogram(
    df,
    x="City",
    y="PM2.5",
    color="City",
    histfunc="avg",
    title="Histogram of Average PM2.5 Levels by City",
    labels={"City": "City", "PM2.5": "Average PM2.5 Levels"},
    width=1000,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='City',
    xaxis_showgrid=True,
    yaxis_title='Average PM2.5 Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import numpy as np
import scipy.stats as stats
import plotly.graph_objs as go

# NO2 readings with missing values removed (probplot orders the values itself)
no2_data = df['NO2'].dropna()

# probplot returns ((theoretical quantiles, ordered sample values),
#                   (slope, intercept, r)) for a least-squares fit line
qq = stats.probplot(no2_data, dist="norm")

# Extract theoretical quantiles and sample quantiles
theoretical_quantiles = qq[0][0]
sample_quantiles = qq[0][1]
fit_slope, fit_intercept = qq[1][0], qq[1][1]

# Create the QQ plot with Plotly
fig = go.Figure()

# Add trace for the QQ plot points
fig.add_trace(go.Scatter(x=theoretical_quantiles, y=sample_quantiles, mode='markers', name='Data'))

# Reference line: the probplot fit line. A plain y = x line would compare
# standard-normal quantiles with NO2 values in their original units, so it
# would not be a meaningful normality reference.
fig.add_trace(go.Scatter(
    x=theoretical_quantiles,
    y=fit_slope * theoretical_quantiles + fit_intercept,
    mode='lines',
    name='Reference Line'
))

# Update layout for better visualization
fig.update_layout(
    title="QQ Plot for NO2",
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Sample Quantiles",
    width=800,
    height=600
)

# Show the interactive QQ plot
fig.show()


In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# List of gases to plot
gases = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene", "Xylene"]

# Last five calendar years present in the data. unique() yields values in
# order of first appearance, so they are sorted before taking the tail.
years_to_include = sorted(df['Year'].dropna().unique())[-5:]

# Define color for each gas
colors = {
    "PM2.5": "blue",
    "PM10": "green",
    "NO": "red",
    "NO2": "purple",
    "NOx": "orange",
    "NH3": "brown",
    "CO": "pink",
    "SO2": "cyan",
    "O3": "magenta",
    "Benzene": "yellow",
    "Toluene": "lime",
    "Xylene": "teal"
}

# Rows from the selected years; the same for every gas, so filtered once
hist_data = df[df['Year'].isin(years_to_include)]

# Grid just large enough for the gases (4 columns, rows by ceiling division)
n_cols = 4
n_rows = -(-len(gases) // n_cols)
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=gases, vertical_spacing=0.1, horizontal_spacing=0.05)

# One subplot per gas, filled left-to-right, top-to-bottom
for position, gas in enumerate(gases):
    row = position // n_cols + 1
    col = position % n_cols + 1

    # Average of the gas per year bin
    fig.add_trace(
        go.Histogram(
            x=hist_data['Year'],
            y=hist_data[gas],
            histfunc='avg',
            marker_color=colors[gas],
            name=gas
        ),
        row=row,
        col=col
    )

# Update layout for the entire figure
fig.update_layout(
    title="Histograms of Gases over Last 5 Years",
    height=300 * n_rows,
    width=1200,
    showlegend=False,
    bargap=0.1
)

# Show the interactive plot
fig.show()


In [None]:
import plotly.express as px

# Correlation between NO2 and O3 over the whole dataset
correlation_no2_o3 = df['NO2'].corr(df['O3'])

# Scatter of NO2 vs O3, coloured by NO2, with the correlation in the title
fig = px.scatter(
    df,
    x="NO2",
    y="O3",
    color="NO2",
    hover_data=["City"],
    labels={"NO2": "NO2 Levels", "O3": "O3 Levels"},
    title=f"Scatter Plot of NO2 vs O3 (Correlation: {correlation_no2_o3:.2f})",
    width=800,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='NO2 Levels',
    xaxis_showgrid=True,
    yaxis_title='O3 Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import pandas as pd

# Pollutant columns to summarise
gases = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene", "Xylene"]

# Work on the pollutant columns only
gas_frame = df[gases]

# Per-column mean, median, standard deviation and share of missing rows
mean_values = gas_frame.mean()
median_values = gas_frame.median()
std_dev_values = gas_frame.std()
proportion_missing = gas_frame.isnull().sum() / len(df)

# One row per gas, one column per statistic
parameters_df = pd.DataFrame(
    {
        'Mean': mean_values,
        'Median': median_values,
        'Standard Deviation': std_dev_values,
        'Proportion Missing': proportion_missing,
    }
)

# Print the summary table
print(parameters_df)


In [None]:
import plotly.express as px

# Correlation between CO and AQI over the whole dataset
correlation_co_aqi = df['CO'].corr(df['AQI'])

# Scatter of CO vs AQI, coloured by CO, with the correlation in the title
fig = px.scatter(
    df,
    x="CO",
    y="AQI",
    color="CO",
    hover_data=["City"],
    labels={"CO": "CO Levels", "AQI": "AQI Levels"},
    title=f"Scatter Plot of CO vs AQI (Correlation: {correlation_co_aqi:.2f})",
    width=800,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='CO Levels',
    xaxis_showgrid=True,
    yaxis_title='AQI Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import plotly.express as px

# Correlation between SO2 and O3 over the whole dataset
correlation = df['SO2'].corr(df['O3'])

# Scatter of SO2 vs O3, coloured by SO2, with the correlation in the title
fig = px.scatter(
    df,
    x="SO2",
    y="O3",
    color="SO2",
    hover_data=["City"],
    labels={"SO2": "SO2 Levels", "O3": "O3 Levels"},
    title=f"Scatter Plot of SO2 vs O3 (Correlation: {correlation:.2f})",
    width=800,
    height=600,
)

# Axis titles, grid lines and nearest-point hover
fig.update_layout(
    xaxis_title='SO2 Levels',
    xaxis_showgrid=True,
    yaxis_title='O3 Levels',
    yaxis_showgrid=True,
    hovermode='closest',
)

# Render the interactive figure
fig.show()


In [None]:
import pandas as pd

# Pollutant columns included in the correlation matrix
gases = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene", "Xylene"]

# Pairwise correlations between the selected columns
gas_frame = df[gases]
correlation_matrix = gas_frame.corr()

# Print the matrix
print(correlation_matrix)


In [None]:
import numpy as np
import scipy.stats as stats
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# List of gases for which we need QQ plots
gases = ["SO2", "O3", "PM2.5", "CO", "NO2"]

# Create a 2x3 grid of subplots (the sixth panel stays empty)
n_cols = 3
fig = make_subplots(rows=2, cols=n_cols, subplot_titles=gases)

# One QQ plot per gas, filled left-to-right, top-to-bottom
for position, gas in enumerate(gases):
    row = position // n_cols + 1
    col = position % n_cols + 1

    gas_data = df[gas].dropna()

    # probplot returns ((theoretical quantiles, ordered sample values),
    #                   (slope, intercept, r)) for a least-squares fit line
    qq = stats.probplot(gas_data, dist="norm")

    # Extract theoretical quantiles and sample quantiles
    theoretical_quantiles = qq[0][0]
    sample_quantiles = qq[0][1]
    fit_slope, fit_intercept = qq[1][0], qq[1][1]

    # Add the QQ plot for the current gas to the figure
    fig.add_trace(
        go.Scatter(x=theoretical_quantiles, y=sample_quantiles, mode='markers', name=gas),
        row=row,
        col=col
    )

    # Reference line: the probplot fit line. A plain y = x line would compare
    # standard-normal quantiles with values in the gas's original units.
    fig.add_trace(
        go.Scatter(
            x=theoretical_quantiles,
            y=fit_slope * theoretical_quantiles + fit_intercept,
            mode='lines',
            name='Reference Line'
        ),
        row=row,
        col=col
    )

# Update layout for better visualization
fig.update_layout(
    title="QQ Plots for SO2, O3, PM2.5, CO, and NO2",
    height=800,
    width=1200,
    showlegend=False
)

# Show the interactive QQ plot
fig.show()


In [None]:
import plotly.graph_objs as go

# Convert the 'Date' column to datetime if not already done
df['Date'] = pd.to_datetime(df['Date'])

# List of gases
gases = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene", "Xylene"]

# The dataset holds rows for several cities, so plotting df directly would
# join readings from different cities into one line per gas. Aggregate to one
# value per date first: the mean of each gas across all rows for that date.
# groupby returns the dates in sorted order.
daily_mean = df.groupby('Date')[gases].mean()

# Create a figure
fig = go.Figure()

# Loop through each gas and add a line plot for its trend over time
for gas in gases:
    fig.add_trace(go.Scatter(
        x=daily_mean.index,
        y=daily_mean[gas],
        mode='lines',
        name=gas,
        line=dict(width=2)
    ))

# Update layout with titles and axis labels
fig.update_layout(
    title="Trends of All Gases Over Time (Daily Mean Across Cities)",
    xaxis_title="Date",
    yaxis_title="Concentration",
    width=1000,
    height=600,
    hovermode='x unified'  # Show all values when hovering over a single point on the x-axis
)

# Show the plot
fig.show()


In [None]:
# Relative frequency of each AQI_Bucket category: normalize=True makes
# value_counts return proportions instead of raw counts
proportion_aqi_bucket = df['AQI_Bucket'].value_counts(normalize=True)

# Print the proportion of each AQI category
print(proportion_aqi_bucket)


In [None]:
# Pollutant columns whose completeness is reported
gases = ["PM2.5", "SO2", "O3", "NO2", "CO"]

# Non-missing entries per column divided by the total number of rows
non_missing_counts = df[gases].count()
proportion_non_missing = non_missing_counts / len(df)

# Print the share of non-missing values for each selected gas
print(proportion_non_missing)


In [None]:
# Assuming 'df' is the DataFrame containing the dataset

# Import necessary libraries before first use, so this cell does not depend on
# an earlier cell having imported numpy
from scipy import stats
import numpy as np

# Check which columns are numeric
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Fill missing values for only numeric columns with the column mean.
# NOTE(review): this overwrites df in place. Every later cell sees the imputed
# values, so later dropna() calls on these columns no longer remove the rows
# that were originally missing, and imputed values count towards the
# proportions computed below.
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Calculate mean, standard deviation, and proportion for PM2.5
mean_pm25 = np.mean(df['PM2.5'])
std_pm25 = np.std(df['PM2.5'])
proportion_pm25 = np.sum(df['PM2.5'] > 35) / len(df['PM2.5'])  # Example: Proportion of values above 35 µg/m3

# Print the results
print(f'Mean of PM2.5: {mean_pm25}')
print(f'Standard Deviation of PM2.5: {std_pm25}')
print(f'Proportion of PM2.5 above 35 µg/m3: {proportion_pm25}')


In [None]:
# Assuming 'df' is the DataFrame containing the dataset

# Import necessary libraries before first use, so this cell does not depend on
# an earlier cell having imported numpy
import numpy as np

# Check which columns are numeric
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Fill missing values for only numeric columns with the column mean.
# NOTE(review): this overwrites df in place; if an earlier cell already ran the
# same imputation there is nothing left to fill and this step changes nothing.
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Function to calculate and display mean, standard deviation, and proportion for each gas
def calculate_statistics(gas, threshold, data=None):
    """Print the mean, standard deviation and share of values above a threshold.

    Parameters
    ----------
    gas : str
        Name of the column to summarise.
    threshold : float
        Values strictly greater than this count towards the proportion.
    data : pandas.DataFrame, optional
        Frame holding the column. When omitted, the notebook-level ``df`` is
        used, which is the original behaviour.

    Returns
    -------
    None
        The statistics are printed, not returned.
    """
    frame = df if data is None else data
    values = frame[gas]

    mean_gas = np.mean(values)
    std_gas = np.std(values)  # np.std defaults to ddof=0 (population SD)
    proportion_gas = np.sum(values > threshold) / len(values)  # Example: Proportion of values above the threshold

    # NOTE(review): the unit label is fixed to µg/m3 for every gas — confirm it
    # is correct for each column passed in.
    print(f'\nStatistics for {gas}:')
    print(f'Mean of {gas}: {mean_gas}')
    print(f'Standard Deviation of {gas}: {std_gas}')
    print(f'Proportion of {gas} above {threshold} µg/m3: {proportion_gas}')

# Example thresholds per gas, reported in this order
example_thresholds = (
    ('PM2.5', 35),
    ('SO2', 20),
    ('O3', 50),
    ('CO', 10),
    ('NO2', 40),
)

# Print the statistics block for each gas in turn
for gas_name, gas_threshold in example_thresholds:
    calculate_statistics(gas_name, gas_threshold)


In [None]:
import pandas as pd
import numpy as np

# Assumes 'df' (the dataset) already exists in the notebook

# Gases to summarise
gases = ['PM2.5','SO2', 'O3', 'CO', 'NO2']

# One record per gas: mean, standard deviation and share of values above 35
results_list = [
    {
        'Gas': gas,
        'Mean': np.mean(df[gas]),
        'Standard Deviation': np.std(df[gas]),
        'Proportion (>35 µg/m³)': np.sum(df[gas] > 35) / len(df[gas]),
    }
    for gas in gases
]

# Tabulate the records
results_df = pd.DataFrame(results_list)

# Print the table
print(results_df)


In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# SO2 readings with missing values removed
so2_data = df['SO2'].dropna()

# Let probplot draw the normal QQ plot directly onto a matplotlib axes
fig, ax = plt.subplots(figsize=(8, 6))
stats.probplot(so2_data, dist="norm", plot=ax)

# Labels are set after probplot so they replace the ones it writes
ax.set_title('QQ Plot for SO2')
ax.set_ylabel('Observed SO2 Values')
ax.set_xlabel('Theoretical Quantiles')
ax.grid(True)
plt.show()


In [None]:
import plotly.graph_objs as go
import numpy as np
import scipy.stats as stats

# Assuming df is your DataFrame and contains a column 'SO2'
# Extract the SO2 column and remove missing values
so2_data = df['SO2'].dropna()

# probplot returns ((theoretical quantiles, ordered sample values),
#                   (slope, intercept, r)) for a least-squares fit line
qq = stats.probplot(so2_data, dist="norm")

# Extract the theoretical quantiles and the sample quantiles
theoretical_quantiles = qq[0][0]
sample_quantiles = qq[0][1]
fit_slope, fit_intercept = qq[1][0], qq[1][1]

# Create the interactive QQ plot using Plotly
fig = go.Figure()

# Add scatter plot for the QQ points
fig.add_trace(go.Scatter(x=theoretical_quantiles, y=sample_quantiles, mode='markers', name='SO2 Data'))

# Reference line: the probplot fit line. A plain y = x line would compare
# standard-normal quantiles with SO2 values in their original units.
fig.add_trace(go.Scatter(
    x=theoretical_quantiles,
    y=fit_slope * theoretical_quantiles + fit_intercept,
    mode='lines',
    name='Reference Line'
))

# Update layout for the plot
fig.update_layout(
    title="Interactive QQ Plot for SO2",
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Sample Quantiles",
    width=800,
    height=600,
    hovermode="closest"
)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objs as go
import numpy as np
import scipy.stats as stats

# Assuming df is your DataFrame and contains a column 'NO2'
# Extract the NO2 data and remove missing values
no2_data = df['NO2'].dropna()

# probplot returns ((theoretical quantiles, ordered sample values),
#                   (slope, intercept, r)) for a least-squares fit line
qq_no2 = stats.probplot(no2_data, dist="norm")

# Extract the theoretical quantiles and sample quantiles
theoretical_quantiles_no2 = qq_no2[0][0]
sample_quantiles_no2 = qq_no2[0][1]
fit_slope_no2, fit_intercept_no2 = qq_no2[1][0], qq_no2[1][1]

# Create an interactive QQ plot for NO2 using Plotly
fig = go.Figure()

# Add the scatter plot for the NO2 data points, using a different color
fig.add_trace(go.Scatter(x=theoretical_quantiles_no2, y=sample_quantiles_no2, mode='markers', name='NO2 Data', marker_color='magenta'))

# Reference line: the probplot fit line. A plain y = x line would compare
# standard-normal quantiles with NO2 values in their original units.
fig.add_trace(go.Scatter(
    x=theoretical_quantiles_no2,
    y=fit_slope_no2 * theoretical_quantiles_no2 + fit_intercept_no2,
    mode='lines',
    name='Reference Line',
    line=dict(color='orange')
))

# Update layout for better visualization
fig.update_layout(
    title="Interactive QQ Plot for NO2",
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Sample Quantiles",
    width=800,
    height=600,
    hovermode="closest"
)

# Show the interactive plot
fig.show()


In [None]:
import plotly.graph_objs as go
import numpy as np
import scipy.stats as stats

# Assuming df is your DataFrame and contains a column 'PM2.5'
# Extract the PM2.5 data and remove missing values
pm25_data = df['PM2.5'].dropna()

# probplot returns ((theoretical quantiles, ordered sample values),
#                   (slope, intercept, r)) for a least-squares fit line
qq_pm25 = stats.probplot(pm25_data, dist="norm")

# Extract the theoretical quantiles and sample quantiles
theoretical_quantiles_pm25 = qq_pm25[0][0]
sample_quantiles_pm25 = qq_pm25[0][1]
fit_slope_pm25, fit_intercept_pm25 = qq_pm25[1][0], qq_pm25[1][1]

# Create an interactive QQ plot for PM2.5 using Plotly
fig = go.Figure()

# Add the scatter plot for the PM2.5 data points, using a different color
fig.add_trace(go.Scatter(x=theoretical_quantiles_pm25, y=sample_quantiles_pm25, mode='markers', name='PM2.5 Data', marker_color='green'))

# Reference line: the probplot fit line. A plain y = x line would compare
# standard-normal quantiles with PM2.5 values in their original units.
fig.add_trace(go.Scatter(
    x=theoretical_quantiles_pm25,
    y=fit_slope_pm25 * theoretical_quantiles_pm25 + fit_intercept_pm25,
    mode='lines',
    name='Reference Line',
    line=dict(color='blue')
))

# Update layout for better visualization
fig.update_layout(
    title="Interactive QQ Plot for PM2.5",
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Sample Quantiles",
    width=800,
    height=600,
    hovermode="closest"
)

# Show the interactive plot
fig.show()


In [None]:
import plotly.graph_objs as go
import numpy as np
import scipy.stats as stats

# Assuming df is your DataFrame and contains a column 'O3'
# Extract the O3 data and remove missing values
o3_data = df['O3'].dropna()

# probplot returns ((theoretical quantiles, ordered sample values),
#                   (slope, intercept, r)) for a least-squares fit line
qq_o3 = stats.probplot(o3_data, dist="norm")

# Extract the theoretical quantiles and sample quantiles
theoretical_quantiles_o3 = qq_o3[0][0]
sample_quantiles_o3 = qq_o3[0][1]
fit_slope_o3, fit_intercept_o3 = qq_o3[1][0], qq_o3[1][1]

# Create an interactive QQ plot for O3 using Plotly
fig = go.Figure()

# Add the scatter plot for the O3 data points, using a different color
fig.add_trace(go.Scatter(x=theoretical_quantiles_o3, y=sample_quantiles_o3, mode='markers', name='O3 Data', marker_color='purple'))

# Reference line: the probplot fit line. A plain y = x line would compare
# standard-normal quantiles with O3 values in their original units.
fig.add_trace(go.Scatter(
    x=theoretical_quantiles_o3,
    y=fit_slope_o3 * theoretical_quantiles_o3 + fit_intercept_o3,
    mode='lines',
    name='Reference Line',
    line=dict(color='red')
))

# Update layout for better visualization
fig.update_layout(
    title="Interactive QQ Plot for O3",
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Sample Quantiles",
    width=800,
    height=600,
    hovermode="closest"
)

# Show the interactive plot
fig.show()


In [None]:
import plotly.graph_objs as go
import numpy as np
import scipy.stats as stats

# Assuming df is your DataFrame and contains a column 'CO'
# Extract the CO data and remove missing values
co_data = df['CO'].dropna()

# probplot returns ((theoretical quantiles, ordered sample values),
#                   (slope, intercept, r)) for a least-squares fit line
qq_co = stats.probplot(co_data, dist="norm")

# Extract the theoretical quantiles and sample quantiles
theoretical_quantiles_co = qq_co[0][0]
sample_quantiles_co = qq_co[0][1]
fit_slope_co, fit_intercept_co = qq_co[1][0], qq_co[1][1]

# Create an interactive QQ plot for CO using Plotly
fig = go.Figure()

# Add the scatter plot for the CO data points, using a different color
fig.add_trace(go.Scatter(x=theoretical_quantiles_co, y=sample_quantiles_co, mode='markers', name='CO Data', marker_color='orange'))

# Reference line: the probplot fit line. A plain y = x line would compare
# standard-normal quantiles with CO values in their original units.
fig.add_trace(go.Scatter(
    x=theoretical_quantiles_co,
    y=fit_slope_co * theoretical_quantiles_co + fit_intercept_co,
    mode='lines',
    name='Reference Line',
    line=dict(color='blue')
))

# Update layout for better visualization
fig.update_layout(
    title="Interactive QQ Plot for CO",
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Sample Quantiles",
    width=800,
    height=600,
    hovermode="closest"
)

# Show the interactive plot
fig.show()


1. Randomly select 100 samples from your variables, design a proper hypothesis, perform the following analysis, and interpret your results: claim (5+2.5 points for mean=C (your guess) and proportion=0.5), hypothesis test (2.5+2.5 points), CI (2.5+2.5 points), and interpretation (2.5+2.5 points) for the proportion and the mean of your variable. (20 points)

In [None]:
import pandas as pd
import numpy as np

# Draw 100 PM2.5 readings (missing values excluded) with a fixed seed
pm25_non_missing = df['PM2.5'].dropna()
pm25_samples = pm25_non_missing.sample(n=100, random_state=42)


In [None]:
from scipy import stats

# Claimed population mean (C) under the null hypothesis
hypothesized_mean = 60

# One-sample t-test of the PM2.5 sample mean against the claimed mean
mean_test = stats.ttest_1samp(pm25_samples, popmean=hypothesized_mean)
t_stat, p_value_mean = mean_test[0], mean_test[1]

print(f"T-statistic for mean: {t_stat}")
print(f"P-value for mean: {p_value_mean}")


In [None]:
# Define threshold and calculate the actual proportion in the sample
threshold = 35
observed_proportion = np.sum(pm25_samples > threshold) / len(pm25_samples)

# Define hypothesized proportion
hypothesized_proportion = 0.5

# Calculate z-statistic; the standard error uses the hypothesized proportion,
# as is usual for a one-sample proportion z-test
n = len(pm25_samples)
z_stat = (observed_proportion - hypothesized_proportion) / np.sqrt(hypothesized_proportion * (1 - hypothesized_proportion) / n)

# Two-tailed p-value. The survival function is used instead of 1 - cdf so that
# very small p-values are not lost to floating-point cancellation.
p_value_proportion = 2 * stats.norm.sf(abs(z_stat))

print(f"Z-statistic for proportion: {z_stat}")
print(f"P-value for proportion: {p_value_proportion}")


In [None]:
# Sample mean and sample standard deviation of the PM2.5 draws
sample_mean = pm25_samples.mean()
sample_std = pm25_samples.std()

# t-based interval; n is the sample size set in the proportion-test cell
confidence_level = 0.95
degrees_freedom = n - 1
standard_error_mean = sample_std / np.sqrt(n)
confidence_interval_mean = stats.t.interval(
    confidence_level,
    degrees_freedom,
    loc=sample_mean,
    scale=standard_error_mean,
)

print(f"95% Confidence Interval for Mean: {confidence_interval_mean}")


In [None]:
# Standard error based on the observed proportion
proportion_variance = observed_proportion * (1 - observed_proportion) / n
standard_error_proportion = np.sqrt(proportion_variance)

# Normal critical value for a two-sided 95% interval
z_critical = stats.norm.ppf(0.975)

# Observed proportion plus/minus the margin of error
margin_of_error = z_critical * standard_error_proportion
confidence_interval_proportion = (
    observed_proportion - margin_of_error,
    observed_proportion + margin_of_error,
)

print(f"95% Confidence Interval for Proportion: {confidence_interval_proportion}")


2. Use samples from your other variables to compare two sample means and standard deviations. Submit the results of the following: a. hypothesis test (5+2.5 points), b. CI (2.5+2.5 points) for your variable, c. interpretation of your results (2.5+2.5 points).

In [None]:
# Draw 100 readings from each of NO2 and SO2 (missing values excluded, fixed seed)
no2_non_missing = df['NO2'].dropna()
so2_non_missing = df['SO2'].dropna()
no2_samples = no2_non_missing.sample(n=100, random_state=42)
so2_samples = so2_non_missing.sample(n=100, random_state=42)

# Sample mean and sample standard deviation of each draw
mean_no2, std_no2 = no2_samples.mean(), no2_samples.std()
mean_so2, std_so2 = so2_samples.mean(), so2_samples.std()

print(f"Mean of NO2: {mean_no2}, Standard Deviation of NO2: {std_no2}")
print(f"Mean of SO2: {mean_so2}, Standard Deviation of SO2: {std_so2}")


In [None]:
# Two-sample t-test without assuming equal variances (equal_var=False)
two_sample_test = stats.ttest_ind(no2_samples, so2_samples, equal_var=False)
t_stat, p_value = two_sample_test[0], two_sample_test[1]

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")


In [None]:
# Calculate the difference in means
mean_difference = mean_no2 - mean_so2

# Per-sample variance of the mean, then the standard error of the difference
var_of_mean_no2 = std_no2**2 / len(no2_samples)
var_of_mean_so2 = std_so2**2 / len(so2_samples)
se_difference = np.sqrt(var_of_mean_no2 + var_of_mean_so2)

# Welch–Satterthwaite degrees of freedom: the same unequal-variance
# approximation that ttest_ind(..., equal_var=False) uses, so the interval
# and the test agree with each other
welch_df = (var_of_mean_no2 + var_of_mean_so2) ** 2 / (
    var_of_mean_no2**2 / (len(no2_samples) - 1)
    + var_of_mean_so2**2 / (len(so2_samples) - 1)
)

# Calculate the 95% confidence interval using the t critical value
confidence_level = 0.95
t_critical = stats.t.ppf((1 + confidence_level) / 2, welch_df)
ci_lower = mean_difference - t_critical * se_difference
ci_upper = mean_difference + t_critical * se_difference

print(f"95% Confidence Interval for the Difference in Means: ({ci_lower}, {ci_upper})")


3. Use samples from your other variables to find the correlation between your variable and the other two variables (10 points). Use the hypothesis test (5 points) and CI (5 points) to interpret your results (5 points).

In [None]:
# Randomly select 100 rows that have NO2, PM2.5 and CO all present.
# The rows are drawn jointly so that position i of every sample refers to the
# same observation; sampling each column on its own only keeps the values
# paired if all three columns happen to have the same number of non-missing
# rows, and a correlation on unpaired values is meaningless.
correlation_sample = df[['NO2', 'PM2.5', 'CO']].dropna().sample(n=100, random_state=42)
no2_samples = correlation_sample['NO2']
pm25_samples = correlation_sample['PM2.5']
co_samples = correlation_sample['CO']


In [None]:
# 2x2 correlation matrices; the off-diagonal entry is the pairwise correlation
corr_matrix_no2_pm25 = np.corrcoef(no2_samples, pm25_samples)
corr_matrix_no2_co = np.corrcoef(no2_samples, co_samples)
correlation_no2_pm25 = corr_matrix_no2_pm25[0, 1]
correlation_no2_co = corr_matrix_no2_co[0, 1]

print(f"Correlation between NO2 and PM2.5: {correlation_no2_pm25}")
print(f"Correlation between NO2 and CO: {correlation_no2_co}")


In [None]:
from scipy.stats import pearsonr

# pearsonr gives the correlation coefficient and the p-value of the test
# that the true correlation is zero
pearson_no2_pm25 = pearsonr(no2_samples, pm25_samples)
pearson_no2_co = pearsonr(no2_samples, co_samples)

correlation_stat_no2_pm25, p_value_no2_pm25 = pearson_no2_pm25[0], pearson_no2_pm25[1]
correlation_stat_no2_co, p_value_no2_co = pearson_no2_co[0], pearson_no2_co[1]

print(f"Correlation test for NO2 and PM2.5: Correlation = {correlation_stat_no2_pm25}, P-value = {p_value_no2_pm25}")
print(f"Correlation test for NO2 and CO: Correlation = {correlation_stat_no2_co}, P-value = {p_value_no2_co}")


In [None]:
import math

def fisher_z_confidence_interval(correlation, n, confidence_level=0.95):
    """Confidence interval for a correlation coefficient via Fisher's z-transform.

    The correlation is mapped to z = arctanh(r), a normal interval with
    standard error 1 / sqrt(n - 3) is built around z, and both bounds are
    mapped back with tanh.

    Parameters
    ----------
    correlation : float
        Sample correlation coefficient.
    n : int
        Number of paired observations the correlation was computed from.
    confidence_level : float, optional
        Two-sided confidence level, 0.95 by default.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` on the correlation scale.

    Raises
    ------
    ValueError
        If ``n`` is 3 or less, because the standard error 1 / sqrt(n - 3)
        is then undefined.
    """
    if n <= 3:
        raise ValueError("Fisher z confidence interval requires n > 3 paired observations")

    z = np.arctanh(correlation)
    z_se = 1 / math.sqrt(n - 3)
    z_critical = stats.norm.ppf((1 + confidence_level) / 2)

    z_lower = z - z_critical * z_se
    z_upper = z + z_critical * z_se

    # Back-transform the bounds from the z scale to the correlation scale
    lower_bound = np.tanh(z_lower)
    upper_bound = np.tanh(z_upper)
    return lower_bound, upper_bound

# Both correlations come from samples of the same size as no2_samples
n_pairs = len(no2_samples)

# 95% intervals (the function's default confidence level)
ci_no2_pm25 = fisher_z_confidence_interval(correlation_no2_pm25, n_pairs)
ci_no2_co = fisher_z_confidence_interval(correlation_no2_co, n_pairs)

print(f"95% Confidence Interval for correlation between NO2 and PM2.5: {ci_no2_pm25}")
print(f"95% Confidence Interval for correlation between NO2 and CO: {ci_no2_co}")


In [None]:
# Paired sample: PM2.5, NO2 and SO2 are taken from the same 100 rows, so each
# correlation and its sample size refer to the same observations. This cell
# previously combined a correlation from the full dataset with a sample size
# set in another cell, and used an SO2 correlation that was never computed.
paired_samples = df[['PM2.5', 'NO2', 'SO2']].dropna().sample(n=100, random_state=42)
n_pairs = len(paired_samples)

sample_corr_pm25_no2 = paired_samples['PM2.5'].corr(paired_samples['NO2'])
sample_corr_pm25_so2 = paired_samples['PM2.5'].corr(paired_samples['SO2'])

# Standard error on the Fisher z scale and the normal critical value
se_z = 1 / np.sqrt(n_pairs - 3)
z_critical = stats.norm.ppf(0.975)  # 95% CI

# Fisher transformation for PM2.5 and NO2
z_pm25_no2 = np.arctanh(sample_corr_pm25_no2)
ci_z_pm25_no2 = (z_pm25_no2 - z_critical * se_z, z_pm25_no2 + z_critical * se_z)
ci_pm25_no2 = (np.tanh(ci_z_pm25_no2[0]), np.tanh(ci_z_pm25_no2[1]))

print(f"95% Confidence Interval for Correlation between PM2.5 and NO2: {ci_pm25_no2}")

# Fisher transformation for PM2.5 and SO2
z_pm25_so2 = np.arctanh(sample_corr_pm25_so2)
ci_z_pm25_so2 = (z_pm25_so2 - z_critical * se_z, z_pm25_so2 + z_critical * se_z)
ci_pm25_so2 = (np.tanh(ci_z_pm25_so2[0]), np.tanh(ci_z_pm25_so2[1]))

print(f"95% Confidence Interval for Correlation between PM2.5 and SO2: {ci_pm25_so2}")


4. Use your samples to find a linear regression equation for your variables and target variable (10 points), and plot the regression line (10 points) and the residuals (10 points). Is this a good model to predict your target variable?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
 
# Randomly select 100 complete observations (rows) and split them into columns.
# The rows are sampled once, jointly, so that position i of every series comes
# from the same row of df. Dropping missing values and sampling each column on
# its own would pair values from different rows whenever the columns have
# different missing-value patterns, which breaks the regression below.
regression_sample = (
    df[['PM2.5', 'NO2', 'SO2']]
    .dropna()
    .sample(n=100, random_state=42)
    .reset_index(drop=True)
)
pm25_samples = regression_sample['PM2.5']
no2_samples = regression_sample['NO2']
so2_samples = regression_sample['SO2']

In [None]:
# Design matrix (NO2 and SO2 as predictors) and target (PM2.5)
X = pd.DataFrame({
    'NO2': no2_samples,
    'SO2': so2_samples,
})
y = pm25_samples

# Ordinary least squares fit; fit() returns the estimator itself, so the
# construction and the fit can be chained.
model = LinearRegression().fit(X, y)

# Fitted parameters: one intercept, and one coefficient per column of X
# (in column order: NO2 first, then SO2)
intercept, coefficients = model.intercept_, model.coef_

print(f"Intercept: {intercept}")
print(f"Coefficients: {coefficients}")
 

In [None]:
# Fitted line for PM2.5 against NO2, for visualization.
# The model was fitted on both NO2 and SO2, so the SO2 term cannot simply be
# dropped: intercept + coef_NO2 * NO2 is the prediction at SO2 = 0, which shifts
# the line away from the data. SO2 is held at its sample mean instead, so the
# line passes through the centre of the observed points.
so2_reference = so2_samples.mean()
predicted_pm25_no2 = (
    intercept
    + coefficients[0] * no2_samples
    + coefficients[1] * so2_reference
)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=no2_samples, y=pm25_samples, color='blue', label="Observed PM2.5")
plt.plot(no2_samples, predicted_pm25_no2, color='red', label="Fitted Line (NO2)")
plt.xlabel("NO2")
plt.ylabel("PM2.5")
plt.title("Regression Line for PM2.5 vs NO2")
plt.legend()
plt.show()

In [None]:
# Fitted values from the full model (NO2 and SO2) and the residuals
# (observed minus fitted)
predicted_pm25 = model.predict(X)
residuals = y - predicted_pm25

# Distribution of the residuals: histogram with a kernel density overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, kde=True, color="purple", ax=ax)
ax.set_xlabel("Residuals")
ax.set_ylabel("Frequency")
ax.set_title("Residuals of the Linear Regression Model")
plt.show()

In [None]:
# Residuals against fitted values, with a dashed reference line at zero
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=predicted_pm25, y=residuals, color="purple", ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel("Predicted PM2.5")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot for Linear Regression Model")
plt.show()
 
 

In [None]:
# Goodness-of-fit summary on the fitting data: mean squared error of the
# predictions and the coefficient of determination
mse = mean_squared_error(y_true=y, y_pred=predicted_pm25)
r_squared = r2_score(y_true=y, y_pred=predicted_pm25)

print(f"R-squared: {r_squared}")
print(f"Mean Squared Error: {mse}")
 

6. Use your samples to find a multiple regression equation (40 points). Finally, print your adjusted $R^2$ (10 points). There should be three different models to predict the target variable. Which one of these models is the best to predict your target variable, and why (10 points)?

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Assume df is the DataFrame and contains columns for PM2.5, NO2, SO2, CO, and O3
# Randomly select 100 complete observations (rows) and split them into columns.
# The rows are sampled once, jointly, so that position i of every series comes
# from the same row of df. Dropping missing values and sampling each column on
# its own would pair values from different rows whenever the columns have
# different missing-value patterns, which invalidates the regressions below.
multi_regression_sample = (
    df[['PM2.5', 'NO2', 'SO2', 'CO', 'O3']]
    .dropna()
    .sample(n=100, random_state=42)
    .reset_index(drop=True)
)
pm25_samples = multi_regression_sample['PM2.5']
no2_samples = multi_regression_sample['NO2']
so2_samples = multi_regression_sample['SO2']
co_samples = multi_regression_sample['CO']
o3_samples = multi_regression_sample['O3']

# Define the target variable
y = pm25_samples


In [None]:
# Model 1: PM2.5 explained by NO2 and SO2
X_model1 = pd.DataFrame({
    'NO2': no2_samples,
    'SO2': so2_samples,
})

# fit() returns the estimator itself, so construction and fitting are chained
model1 = LinearRegression().fit(X_model1, y)
intercept_model1, coefficients_model1 = model1.intercept_, model1.coef_

# R-squared on the fitting data, then the adjustment for the number of
# predictors: adj R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
r2_model1 = model1.score(X_model1, y)
n_obs = len(y)
n_predictors = X_model1.shape[1]
adjusted_r2_model1 = 1 - (1 - r2_model1) * (n_obs - 1) / (n_obs - n_predictors - 1)

print("Model 1 Equation: PM2.5 =", intercept_model1, "+", coefficients_model1[0], "* NO2 +", coefficients_model1[1], "* SO2")
print("Model 1 R-squared:", r2_model1)
print("Model 1 Adjusted R-squared:", adjusted_r2_model1)


In [None]:
# Model 2: PM2.5 explained by NO2, SO2 and CO
X_model2 = pd.DataFrame({
    'NO2': no2_samples,
    'SO2': so2_samples,
    'CO': co_samples,
})

# fit() returns the estimator itself, so construction and fitting are chained
model2 = LinearRegression().fit(X_model2, y)
intercept_model2, coefficients_model2 = model2.intercept_, model2.coef_

# R-squared on the fitting data, then the adjustment for the number of
# predictors: adj R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
r2_model2 = model2.score(X_model2, y)
n_obs = len(y)
n_predictors = X_model2.shape[1]
adjusted_r2_model2 = 1 - (1 - r2_model2) * (n_obs - 1) / (n_obs - n_predictors - 1)

print("Model 2 Equation: PM2.5 =", intercept_model2, "+", coefficients_model2[0], "* NO2 +", coefficients_model2[1], "* SO2 +", coefficients_model2[2], "* CO")
print("Model 2 R-squared:", r2_model2)
print("Model 2 Adjusted R-squared:", adjusted_r2_model2)


In [None]:
# Model 3: PM2.5 explained by NO2, SO2, CO and O3
X_model3 = pd.DataFrame({
    'NO2': no2_samples,
    'SO2': so2_samples,
    'CO': co_samples,
    'O3': o3_samples,
})

# fit() returns the estimator itself, so construction and fitting are chained
model3 = LinearRegression().fit(X_model3, y)
intercept_model3, coefficients_model3 = model3.intercept_, model3.coef_

# R-squared on the fitting data, then the adjustment for the number of
# predictors: adj R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
r2_model3 = model3.score(X_model3, y)
n_obs = len(y)
n_predictors = X_model3.shape[1]
adjusted_r2_model3 = 1 - (1 - r2_model3) * (n_obs - 1) / (n_obs - n_predictors - 1)

print("Model 3 Equation: PM2.5 =", intercept_model3, "+", coefficients_model3[0], "* NO2 +", coefficients_model3[1], "* SO2 +", coefficients_model3[2], "* CO +", coefficients_model3[3], "* O3")
print("Model 3 R-squared:", r2_model3)
print("Model 3 Adjusted R-squared:", adjusted_r2_model3)
