### First model using scikit learn linear regression
This is a first attempt at training a linear regression model on the given data.
The data will be analyzed by start date and location.

#### Import libraries and define constants

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# from shapely.geometry import Point

# Import additional libraries
import os
from IPython.display import display
from math import radians, cos, sin, asin, sqrt


In [None]:
# Show every column when displaying a DataFrame (None removes the column limit).
# Use the canonical option name rather than relying on pandas' pattern matching.
pd.set_option("display.max_columns", None)

In [None]:
# Define the file paths.
# The data directory is the 'data' folder next to the current working directory's parent.
# os.path.join is used instead of string concatenation so the separators are
# always correct for the platform.
data_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data')

training_csv = os.path.join(data_dir, 'train_data.csv')
test_csv = os.path.join(data_dir, 'test_data.csv')

print(training_csv)
print(test_csv)

#### Define dataframes and add necessary features

In [None]:
# Read the training CSV and then the test CSV into DataFrames
training_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (training_csv, test_csv)
)

In [None]:
# Write the column names of each data set to a text file, one name per line
for columns_file, frame in (
    ('training_columns.txt', training_data),
    ('test_columns.txt', test_data),
):
    with open(columns_file, 'w', encoding='utf-8') as handle:
        handle.writelines(col + '\n' for col in frame.columns)

# Save the verbose DataFrame.info() summary of the training data to a file
with open('training_data_info.txt', 'w', encoding='utf-8') as handle:
    training_data.info(verbose=True, buf=handle)

# display(training_data.describe())
# display(test_data.describe())

In [None]:
# Find the target column: a column that exists in the training data but not
# in the test data.
extra_columns = training_data.columns.difference(test_data.columns)
if extra_columns.empty:
    # Fail with a clear message instead of an IndexError from extra_columns[0]
    raise ValueError(
        'Cannot infer the target column: every training column also exists in the test data'
    )
# If several columns differ, the first one of the difference is used
target_column = extra_columns[0]
print(f'The target column for prediction is {target_column}')

In [None]:
# Report the columns that contain at least one missing value in each data set
train_null_columns = training_data.columns[training_data.isna().any()]
print(f'Columns with null vaules in Training data are {train_null_columns}')

test_null_columns = test_data.columns[test_data.isna().any()]
print(f'Columns with null vaules in Test data are {test_null_columns}')

In [None]:
# Typecast startdate to datetime and store it as integer Unix epoch seconds
# so it can be used directly as a numeric feature.


def _to_epoch_seconds(dates):
    """Parse 'month/day/2-digit-year' strings and return whole seconds since 1970-01-01.

    A Series has no ``strftime`` method (only its ``.dt`` accessor does), and
    the ``'%s'`` directive is a platform-specific extension, so the epoch
    value is computed arithmetically instead. Dates are treated as
    timezone-naive.
    """
    parsed = pd.to_datetime(dates, format='%m/%d/%y')
    return (parsed - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)


training_data['startdate'] = _to_epoch_seconds(training_data['startdate'])
test_data['startdate'] = _to_epoch_seconds(test_data['startdate'])

In [None]:
# Get current precision (number of decimal places) of latitude and longitude


def _decimal_places(value):
    """Return how many characters follow the decimal point in ``str(value)``.

    Values whose text has no decimal point (for example ``nan``) count as 0
    instead of raising ``IndexError``.
    """
    return len(str(value).partition('.')[2])


def _coordinate_precision(frame):
    """Return a lat/lon DataFrame holding the decimal-place count of every value."""
    # Column-wise Series.map avoids DataFrame.applymap, which is deprecated
    # in recent pandas releases.
    return frame[['lat', 'lon']].apply(lambda column: column.map(_decimal_places))


loc_data = training_data[['lat', 'lon']]
precision = _coordinate_precision(training_data)

print(f'Current precision of latitude in training data is {precision.lat.max()}')
print(f'Current precision of longitude in training data is {precision.lon.max()}')

loc_data = test_data[['lat', 'lon']]
precision = _coordinate_precision(test_data)

print(f'Current precision of latitude in test data is {precision.lat.max()}')
print(f'Current precision of longitude in test data is {precision.lon.max()}')


In [None]:
# Round latitude and longitude to 15 decimal places in both data sets
for frame in (training_data, test_data):
    for coordinate in ('lat', 'lon'):
        frame[coordinate] = frame[coordinate].round(15)

In [None]:
# Need to combine the latitude and longitude for easier data handling
# 'Single-point' Haversine: Calculates the great circle distance between a point on Earth and the (0, 0) lat-long coordinate

def single_pt_haversine(lat, lon, degrees=True, radius=6371):
    """Return the great-circle distance between (lat, lon) and the (0, 0) coordinate.

    Args:
        lat: Latitude of the point (scalar).
        lon: Longitude of the point (scalar).
        degrees: If True (default), ``lat`` and ``lon`` are decimal degrees
            and are converted to radians first; if False they are taken to be
            radians already.
        radius: Sphere radius; the result is in the same unit. Defaults to
            6371, Earth's radius in km.

    Returns:
        The haversine distance from the point to (0, 0).

    Note:
        The result depends only on how far the point is from (0, 0), so
        different coordinates (for example (10, 0) and (-10, 0)) can produce
        the same value.
    """
    # Convert decimal degrees to radians
    if degrees:
        lat, lon = map(radians, [lat, lon])

    # 'Single-point' haversine formula (the second point is fixed at (0, 0))
    a = sin(lat / 2) ** 2 + cos(lat) * sin(lon / 2) ** 2

    # Guard against floating-point drift pushing the term outside [0, 1],
    # which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))

    return 2 * radius * asin(sqrt(a))

In [None]:
# Combine latitude and longitude into a single location feature
# training_data['location'] = training_data.apply(lambda x: Point(x['lon'], x['lat']), axis=1)
# training_data['location']= training_data[['lat','lon']].values.tolist()

# Distance of every training row's (lat, lon) from the (0, 0) coordinate
train_distances = list(map(single_pt_haversine, training_data.lat, training_data.lon))
training_data['haversine_distance'] = train_distances
print(f'There are {training_data.haversine_distance.nunique()} unique locations in training data')

In [None]:
# Distance of every test row's (lat, lon) from the (0, 0) coordinate
test_distances = list(map(single_pt_haversine, test_data.lat, test_data.lon))
test_data['haversine_distance'] = test_distances
print(f'There are {test_data.haversine_distance.nunique()} unique locations in test data')

In [None]:
# Stack the training rows on top of the test rows and count the distinct
# location distances across both data sets
combined_data = pd.concat((training_data, test_data))
combined_distances = list(map(single_pt_haversine, combined_data.lat, combined_data.lon))
combined_data['haversine_distance'] = combined_distances
print(f'There are {combined_data.haversine_distance.nunique()} unique locations in combined data')

In [None]:
# Split the combined data back into training and test dataframes:
# the first rows are the training rows, the remainder are the test rows
n_training_rows = len(training_data)
training_data, test_data = (
    combined_data.iloc[:n_training_rows],
    combined_data.iloc[n_training_rows:],
)

In [None]:
# Extract the values to predict (the target column) from the training data
target_data = training_data.loc[:, target_column]

#### Data wrangling starts here

In [None]:
# Take a backup of the original training dataframe.
# .copy() is required: a plain assignment only creates a second name for the
# same object, so it would not protect the backup from later in-place changes.
training_data_full = training_data.copy()
training_data_full.info()

In [None]:
# Columns kept for modelling, in the order they should appear.
# The last entry is the temperature column 'contest-tmp2m-14d__tmp2m'.
column_names = [
    'haversine_distance',
    'startdate',
    'lat',
    'lon',
    'climateregions__climateregion',
    'elevation__elevation',
    'contest-wind-h500-14d__wind-hgt-500',
    'contest-slp-14d__slp',
    'contest-pres-sfc-gauss-14d__pres',
    'contest-pevpr-sfc-gauss-14d__pevpr',
    'contest-precip-14d__precip',
    'contest-prwtr-eatm-14d__prwtr',
    'contest-rhum-sig995-14d__rhum',
    'contest-wind-uwnd-250-14d__wind-uwnd-250',
    'contest-wind-uwnd-925-14d__wind-uwnd-925',
    'contest-wind-vwnd-250-14d__wind-vwnd-250',
    'contest-wind-vwnd-925-14d__wind-vwnd-925',
    'mei__mei',
    'mei__meirank',
    'mei__nip',
    'mjo1d__amplitude',
    'mjo1d__phase',
    'sst-2010-1',
    'sst-2010-2',
    'sst-2010-3',
    'sst-2010-4',
    'sst-2010-5',
    'sst-2010-6',
    'sst-2010-7',
    'sst-2010-8',
    'sst-2010-9',
    'sst-2010-10',
    'contest-tmp2m-14d__tmp2m',
]

# Drop every column that is not listed, then put the rest in the listed order
unwanted_columns = training_data.columns.difference(column_names)
temp_data = training_data.drop(columns=unwanted_columns)
training_data = temp_data.loc[:, column_names]
training_data.head()

In [None]:
# Store the climate region and the start date as strings
for text_column in ('climateregions__climateregion', 'startdate'):
    training_data[text_column] = training_data[text_column].astype(str)



### Dummy data training codes

In [None]:
# Build the feature matrices, train the model, predict and report the error.
# These steps are kept together because the test features must be prepared in
# exactly the same way as the training features.

# The target must not be used as a predictor, so take every selected column except it
feature_columns = [col for col in training_data.columns if col != target_column]

# LinearRegression needs a purely numeric matrix. Training and test rows are
# encoded together so both end up with exactly the same columns.
n_train = len(training_data)
all_features = pd.concat(
    [training_data[feature_columns], test_data[feature_columns]], axis=0
)
# startdate may have been stored as text; turn it back into a number
all_features['startdate'] = pd.to_numeric(all_features['startdate'])
# One-hot encode the climate region (a text category)
all_features['climateregions__climateregion'] = (
    all_features['climateregions__climateregion'].astype(str)
)
all_features = pd.get_dummies(
    all_features, columns=['climateregions__climateregion'], dtype=float
)
train_features = all_features.iloc[:n_train]
test_features = all_features.iloc[n_train:]
# NOTE(review): LinearRegression rejects missing values; if the null-column
# report lists any feature column, impute or drop those values first.

# Train the model
model = LinearRegression().fit(train_features, target_data)

# Predict the target values for the test data
predictions = model.predict(test_features)

# Calculate the mean squared error between the predicted and actual target values.
# The test data has no actual target values, so the error is measured on the
# training data (an in-sample error, not an estimate of test performance).
train_predictions = model.predict(train_features)
mse = mean_squared_error(target_data, train_predictions)
print("Mean Squared Error:", mse)