<a href="https://colab.research.google.com/github/MorganChidley/Advanced-Topics-Assignment/blob/main/Advanced_Topics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Dataset and Displaying first few rows**

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Location of the Austin weather CSV.
# NOTE(review): the path points into /content/drive, presumably a Google Drive
# mounted in Colab; the mount step is not shown in this notebook, so confirm it
# happens before this cell runs.
file_path = '/content/drive/MyDrive/austin_weather.csv'

# Read the CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to get a feel for the columns
df.head(n=5)

**Filtering Data and Adding Additional Columns**

In [None]:
# Keep only the two columns this analysis uses: "Date" and "TempAvgF".
# The explicit .copy() makes the result an independent DataFrame, so the column
# assignments below write to it directly instead of to a selection of the
# loaded frame (the pattern pandas flags with SettingWithCopyWarning).
df = df[['Date', 'TempAvgF']].copy()

# Convert the Date column to datetime so the .dt accessor is available
df['Date'] = pd.to_datetime(df['Date'])

# Derive calendar features from the parsed dates
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['DayOfYear'] = df['Date'].dt.dayofyear

# Filter data for the first year in the dataset (2014 because there were only
# 11 days of data for 2013). .copy() gives an independent frame so the moving
# average column can be added safely. DayOfYear does not need to be recomputed
# here: it was created above and is carried over by the filter.
df_first_year = df[df['Year'] == 2014].copy()

# Calculate the 25-day moving average of the average temperature.
# The window is 25 rows ending at the current row; min_periods=1 lets the
# earliest rows average over however many rows are available instead of
# producing NaN.
# NOTE(review): "25-day" assumes one row per day in date order - confirm.
df_first_year['TempAvgF_MA25'] = df_first_year['TempAvgF'].rolling(window=25, min_periods=1).mean()

# Display the first few rows of the processed data
df_first_year.head()


**Checking for Missing values and rows**

In [None]:
# Flag every row that has at least one missing cell, then show those rows
has_missing = df.isna().any(axis=1)
missing_rows = df.loc[has_missing]
print(missing_rows)

# Per-column count of missing values (shown as the cell output)
df.isna().sum()

**Checking for Duplicates**

In [None]:
# Removing duplicates: rows sharing the same Date and TempAvgF are collapsed to
# their first occurrence. The result is a new frame; df itself is not modified.
# NOTE(review): the later cells visible in this notebook keep using df rather
# than df_cleaned - confirm that is intended.
dedupe_keys = ['Date', 'TempAvgF']
df_cleaned = df.drop_duplicates(subset=dedupe_keys)

# Compare shapes before and after to see how many rows were dropped
print("Original shape:", df.shape)
print("Shape after removing duplicates:", df_cleaned.shape)

**Calculating Outliers**

In [None]:
# Work on the temperature column directly
temps = df['TempAvgF']

# First and third quartiles and the interquartile range
Q1 = temps.quantile(0.25)
Q3 = temps.quantile(0.75)
IQR = Q3 - Q1

# Outlier fences sit 1.5 * IQR beyond each quartile
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker

# A row is an outlier when its temperature falls strictly outside the fences
is_outlier = temps.lt(lower_bound) | temps.gt(upper_bound)
outliers = df.loc[is_outlier]

# Show the outlier rows
print("Outliers:")
print(outliers)

**Plotting the Raw Data and 25-Day Moving Average**

In [None]:
# Plot the raw daily temperatures together with the 25-day moving average
fig, ax = plt.subplots(figsize=(12, 6))

day_of_year = df_first_year['DayOfYear']

# Raw series is drawn semi-transparent so the smoothed line stands out
ax.plot(day_of_year, df_first_year['TempAvgF'], label='Raw Data', alpha=0.5)
ax.plot(day_of_year, df_first_year['TempAvgF_MA25'], label='25-Day Moving Average', color='red')

# Axis labels, title, legend and grid
ax.set_xlabel('Day of Year')
ax.set_ylabel('Average Temperature (F)')
ax.set_title('Average Temperature with 25-Day Moving Average')
ax.legend()
ax.grid(True)

plt.show()

**Plotting the Trendline using LinearRegression**

In [None]:
# Fit a linear regression of average temperature on day of year.
# X is a one-column DataFrame, so the model is fitted with the feature name
# 'DayOfYear'.
X = df_first_year[['DayOfYear']]
y = df_first_year['TempAvgF']

model = LinearRegression()
model.fit(X, y)

# Print the model's parameters
print(f"Intercept: {model.intercept_}")
print(f"Coefficient: {model.coef_[0]}")

# Days at which the trendline is evaluated (start, middle and end of the year)
first_day = 1
middle_day = 183
last_day = 365

# Predict on a DataFrame that carries the same column name used during fitting.
# Passing a bare NumPy array to a model fitted on a named DataFrame makes
# scikit-learn emit a "X does not have valid feature names" warning.
trendline_days = pd.DataFrame({'DayOfYear': [first_day, middle_day, last_day]})
trendline_values = model.predict(trendline_days)

# Plot the raw data, the moving average and the fitted trendline together
plt.figure(figsize=(12, 6))
plt.plot(df_first_year['DayOfYear'], df_first_year['TempAvgF'], label='Raw Data')
plt.plot(df_first_year['DayOfYear'], df_first_year['TempAvgF_MA25'], label='25-Day Moving Average', color='red')
plt.plot([first_day, middle_day, last_day], trendline_values, label='Trendline', color='green', linestyle='--')
plt.xlabel('Day of Year')
plt.ylabel('Average Temperature (F)')
plt.title('Average Temperature with Trendline')
plt.legend()
plt.grid(True)
plt.show()