In [1]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [2]:
# Location of the Seattle weather CSV.
# The original absolute path is kept as the default so existing behavior is
# unchanged, but it can be overridden with the SEATTLE_WEATHER_CSV environment
# variable so the notebook also runs on machines other than the author's.
Weather_data = os.environ.get(
    "SEATTLE_WEATHER_CSV",
    "/Users/hasnainashraf/Desktop/seattle-weather.csv",
)

# Load the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(Weather_data)

In [3]:
# Parse the 'date' column into pandas datetimes so the .dt accessor can be
# used in later cells.
df['date'] = df['date'].pipe(pd.to_datetime)

In [4]:
# Preview the first five rows (5 is the default for head()).
df.head(n=5)

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [5]:
# Split the parsed date into separate calendar-component columns.
# Columns are added in the order year, month, day.
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(df['date'].dt, date_part)

In [6]:
# Count null entries per column and report them.
missing_per_column = df.isnull().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
year             0
month            0
day              0
dtype: int64


In [7]:
# Create a season column from the month number.
# Months listed here map to Winter/Spring/Summer; anything else falls back to
# 'Fall', matching the final else-branch of the original conditional chain.
season_by_month = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
}
df['season'] = df['month'].map(lambda month: season_by_month.get(month, 'Fall'))

In [8]:
# Build a 'YYYY-MM' label per row so that each month of each year is treated
# as its own category (e.g. 2012-01 and 2013-01 are distinct).
year_month_labels = df['date'].dt.strftime('%Y-%m')
df['year_month'] = year_month_labels

In [9]:
# Analysis 1: regression predicting temp_max from year-month and precipitation.
# This cell only prints the section banner.
section_banner = "\nRegression Analysis: Predicting maximum temperature"
print(section_banner)


Regression Analysis: Predicting maximum temperature


In [10]:
# Feature matrix: one dummy column per year-month (first level dropped so it
# acts as the baseline), with precipitation appended as the last column.
X = pd.get_dummies(
    df['year_month'],
    prefix='year_month',
    drop_first=True,
).assign(precipitation=df['precipitation'])

# Target: daily maximum temperature.
y = df['temp_max']

In [11]:
# Fit an ordinary least-squares linear regression on the full dataset.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X, y)
# Keep the fitted estimator as the cell's last expression so it is still displayed.
model

In [12]:
# Report the fit quality and the model size.
# The R² here is computed on the same X, y the model was fitted on.
r_squared = model.score(X, y)
feature_count = model.coef_.shape[0]
print(f"R² score: {r_squared:.4f}")
print(f"Number of features: {feature_count}")
print("Sample coefficients (first 10):")

R² score: 0.7937
Number of features: 48
Sample coefficients (first 10):


In [13]:
# Pair every feature name with its fitted coefficient in a DataFrame.
coef_columns = {'Feature': X.columns, 'Coefficient': model.coef_}
coef_df = pd.DataFrame(coef_columns)

# Only the first 10 rows are printed to keep the output readable.
first_ten = coef_df.head(10)
print(first_ten)
print(f"Intercept: {model.intercept_:.4f}")

              Feature  Coefficient
0  year_month_2012-02     2.120169
1  year_month_2012-03     2.513108
2  year_month_2012-04     7.679404
3  year_month_2012-05    10.442808
4  year_month_2012-06    11.509179
5  year_month_2012-07    15.652970
6  year_month_2012-08    18.569044
7  year_month_2012-09    15.592236
8  year_month_2012-10     8.770140
9  year_month_2012-11     4.331578
Intercept: 7.2890


In [None]:
"""Here we analyzed historical weather data from Seattle and used a linear regression model to examine
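the relationship between maximum temperature (temp_max) and two kinds of predictors: the year-month of
each observation (one dummy variable per month of each year) and precipitation. The model achieved an
R² score of 0.79, indicating that approximately 79% of the variance in temp_max is explained by the
year-month indicators and precipitation together. Note that this R² is computed on the same data the
model was fitted on, so it measures in-sample fit rather than predictive accuracy.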

This suggests a strong association within the dataset, largely reflecting seasonal patterns: the 2012
year-month coefficients shown above rise from about 2.1 for February to about 18.6 for August before
falling again, with precipitation acting as an additional predictor. However, weather systems
are inherently complex, and this correlation does not imply causation. Further analysis with more variables
(like humidity, atmospheric pressure, or more compact seasonality indicators such as the season column)
and nonlinear models could provide a deeper understanding."""