# Module 7: Machine Learning

## Author Jason A. Ballard  

## [JBtallgrass GitHub Repo](https://github.com/JBtallgrass/datafun-07-ml)

In [None]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Part 1: Chart a straight line

In [None]:
def c(f):
    """Convert a Fahrenheit temperature ``f`` to degrees Celsius."""
    return 5 / 9 * (f - 32)


# (Fahrenheit, Celsius) pairs for 0..100 degrees F in steps of 10
temps = [(deg_f, c(deg_f)) for deg_f in range(0, 101, 10)]

temps_df = pd.DataFrame(temps, columns=['Fahrenheit', 'Celsius'])

# Line plot with point markers; DataFrame.plot returns the Axes it drew on
axes = temps_df.plot(x='Fahrenheit', y='Celsius', style='.-')
y_label = axes.set_ylabel('Celsius')
axes.set_title('Celsius to Fahrenheit relationship')

# Write the chart into the figures directory as a PNG, then render it
plt.savefig('figures/temp_line_chart.png')
plt.show()


## Part 2: Predict Avg High Temp in NYC in January

### Section 1: Data Acquisition

In [None]:
# Load the NYC January average-high temperature CSV into a DataFrame
nyc_df = pd.read_csv('data/ave_hi_nyc_jan_1895-2018.csv')
# Preview the first rows to confirm the file parsed as expected
nyc_df.head()

### Section 2: Data Inspection

In [None]:
# Preview the last rows of the table as well
nyc_df.tail()

In [None]:
# Replace the column labels with the short names used by the rest of the notebook
# (assigning three labels assumes the CSV has exactly three columns)
nyc_df.columns =['Date', 'Temperature', 'Anomaly']
nyc_df.head()

In [None]:
# Check the dtype of Date before doing integer arithmetic on it in the next cell
nyc_df.Date.dtype

In [None]:
# Integer-divide Date by 100 to drop its last two digits.
# NOTE: this overwrites the column in place, so running the cell a second
# time divides the already-truncated values by 100 again.
nyc_df['Date'] = nyc_df['Date'] // 100
nyc_df.head(3)


### Section 4: Descriptive Statistics

In [None]:
# Show floats with two decimal places in pandas output
pd.options.display.precision = 2

# Summary statistics of the Temperature column
nyc_df['Temperature'].describe()



### Section 5: Build the Model

In [None]:
# Least-squares fit of Temperature against Date using SciPy
linear_regression = stats.linregress(nyc_df['Date'], nyc_df['Temperature'])

# Slope of the fitted line
linear_regression.slope


In [None]:
# Intercept of the line fitted by stats.linregress
linear_regression.intercept

### Section 6: Predict

In [None]:
# Evaluate the fitted line (slope * Date + intercept) at Date = 2024
linear_regression.slope * 2024 + linear_regression.intercept

### Section 7: Data Visualization

In [None]:
# Use seaborn's dark grid background for this chart
sns.set_style('darkgrid')

# Scatter of Temperature against Date with seaborn's fitted regression line
axes = sns.regplot(data=nyc_df, x='Date', y='Temperature', color='orange')

# Fix the y-range and label the chart in a single call
axes.set(
    ylim=(10, 70),
    title='Temperature Over Time',
    xlabel='Date',
    ylabel='Temperature',
)

# Tilt the x tick labels so neighbouring labels do not collide
plt.xticks(rotation=45)

# Write a 300-dpi PNG into the figures directory, then render the chart
plt.savefig('figures/regression_plot.png', dpi=300)
plt.show()

In [None]:
# Residuals of a linear fit of Temperature on Date (observed minus fitted)
resid_axes = sns.residplot(data=nyc_df, x='Date', y='Temperature', color='orange')

# Title and axis labels, set directly on the returned Axes
resid_axes.set(
    title='Residuals of Temperature Regression\n difference between Observed and Predicted',
    xlabel='Date',
    ylabel='Residuals',
)

# Write a 300-dpi PNG into the figures directory, then render the chart
plt.savefig('figures/temperature_residuals.png', dpi=300)
plt.show()


## Part 3: Predict the Average High Temperature in January in NYC

### Section 1: Build the Model (Split training and testing)

In [None]:
# scikit-learn expects a 2-D feature matrix, so Date becomes a single-column array.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    nyc_df['Date'].values.reshape(-1, 1),
    nyc_df['Temperature'].values,
    random_state=11,
)

# Size of the training split
X_train.shape

In [None]:
# Size of the held-out test split
X_test.shape

In [None]:
# NOTE: this rebinds `linear_regression`, which earlier in the notebook held the
# scipy.stats.linregress result; re-running the earlier cells that read
# `.slope` / `.intercept` after this point will fail.
linear_regression = LinearRegression()

# Fit the estimator on the training split only
linear_regression.fit(X=X_train, y=y_train)

In [None]:
# Fitted coefficient(s); the feature matrix has a single column (Date)
linear_regression.coef_

In [None]:
# Fitted intercept of the scikit-learn model
linear_regression.intercept_

### Section 2: Test the Model

In [None]:
# Model output for the held-out dates, alongside the true temperatures
predicted = linear_regression.predict(X_test)
expected = y_test

# Print every fifth predicted/expected pair as a spot check
for idx in range(0, len(predicted), 5):
    print(f'predicted: {predicted[idx]:.2f}, expected: {expected[idx]:.2f}')

### Section 3: Predict 

In [None]:
def predict(x):
    """Evaluate the fitted line ``coef_ * x + intercept_`` at ``x``.

    Reads ``linear_regression`` from the notebook namespace at call time.
    """
    return linear_regression.coef_ * x + linear_regression.intercept_


predict(2024)

### Section 4: Data Visualizations

In [None]:
# Scatter of Temperature against Date, with points coloured by temperature
axes = sns.scatterplot(data=nyc_df, x='Date', y='Temperature', hue='Temperature', palette='winter', legend=False)

# Tilt the x tick labels so neighbouring labels do not overlap
plt.xticks(rotation=45)
plt.title('Temperature Predictions for NYC \n January (2024)')

# Run tight_layout only after the title and tick rotation are set, so the
# layout pass reserves room for them and the saved image is not clipped
plt.tight_layout()

# Save the figure as a PNG file with 300 DPI
plt.savefig('figures/nyc_temperature_scatterplot.png', dpi=300)

# Show the plot
plt.show()


## Part 4:  Analysis

### Key Takeaways from the Module 7 project

1. **Gradual Warming Trend with Profound Implications:** The slight upward trend in temperatures over the 124-year period not only confirms the reality of global warming but also signals long-term environmental, ecological, and social implications. This underscores the urgency of acknowledging and addressing the subtle yet significant shifts in climate patterns.

2. **Challenges in Climate Forecasting:** The discrepancies between predictions made by linear regression models and actual observed temperatures highlight the inherent difficulties in accurately modeling and forecasting climate phenomena. This points to the complexity of the climate system and the limitations of current predictive tools in capturing this complexity.

3. **Need for Advanced Predictive Models:** The analysis consistently stresses the importance of methodological advancements and the development of more sophisticated models for climate prediction. Such advancements are critical for improving the accuracy of climate projections, enabling better preparedness and response strategies for climate change mitigation and adaptation.

### Model comparison

1. I couldn't tell a difference other than the amount of typing (anti cut-and-paste) in a programming class, due to counting reps.
2. The machine-learning approach in Python (Chapter 15) presented tighter predictions that may have better confidence intervals.
3. I am not an expert and therefore I am excited to continue learning.
4. I am amazed at the amount of variation you can play with in the programming and DataViz applications.


## Part 5: Bonus
"California Housing Dataset"

### Section 1: Fetch Data

In [None]:
# scikit-learn's loader for the California housing dataset
from sklearn.datasets import fetch_california_housing

# `california` exposes .data, .target, .feature_names (used in the cells below)
california = fetch_california_housing()

### Section 2: Display Dataset Description

In [None]:
# Print the dataset's bundled description text.
# The attribute is spelled DESCR; the previous `DECR` raised AttributeError.
print(california.DESCR)

In [None]:
# Shape of the feature matrix
california.data.shape

In [None]:
# Shape of the target array
california.target.shape

In [None]:
# Names of the features; used below as the DataFrame column labels
california.feature_names

### Section 3: Explore the Dataset

In [None]:
# Use fully qualified option keys: 'precision_display' is not a pandas option,
# and a bare 'max_columns' can match more than one key on recent pandas
# versions, which makes set_option raise OptionError.

# Show floats with four decimal places
pd.set_option('display.precision', 4)

# Show up to nine columns before pandas truncates the table
pd.set_option('display.max_columns', 9)

# None lets pandas detect the display width instead of using a fixed value
pd.set_option('display.width', None)

In [None]:
# Feature matrix as a labelled DataFrame, with the target appended as
# a 'MedHouseValue' column
california_df = pd.DataFrame(
    california.data,
    columns=california.feature_names,
).assign(MedHouseValue=pd.Series(california.target))

In [None]:
# Preview the first rows of the combined features + target table
california_df.head()



In [None]:
# Preview the last rows of the table
california_df.tail()

In [None]:
# Summary statistics for every column
california_df.describe()

### Section 4: Data Visualization

In [None]:
# Plot a reproducible 10% random sample of california_df (built in an earlier cell)
sample_df = california_df.sample(frac=0.1, random_state=17)

# Larger fonts on a dark grid for the big 16x9 figures
sns.set(font_scale=2)
sns.set_style('darkgrid')

# One scatter plot per feature: feature on x, median house value on y
for feature in california.feature_names:
    fig, ax = plt.subplots(figsize=(16, 9))
    sns.scatterplot(
        data=sample_df,
        x=feature,
        y='MedHouseValue',
        hue='MedHouseValue',
        palette='coolwarm',
        legend=False,
        ax=ax,
    )

    # The file name embeds the feature name, so every figure gets its own PNG
    fig.savefig(f'figures/{feature}_vs_MedHouseValue.png', dpi=300)

    plt.show()

    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)


### Section 5: Build the Model

In [None]:
# Split features and target into train/test sets; random_state pins the
# shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    california.data,
    california.target,
    random_state=11,
)

# Size of the training split
X_train.shape

In [None]:
# Size of the held-out test split
X_test.shape

### Section 6a: Train the Model

### Section 6b: Testing the Model

### Section 7: Comparison of Prediction Vs. Expected Prices

### Section 8: Regression Metrics

### Section 9: Choosing the Best Model