### First model using scikit learn linear regression
This is a first attempt to train using the given data using a linear regression model
The data will be analyzed based on startdate, locations

#### Import libraries and define constants

In [19]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# from shapely.geometry import Point

# Import additional libraries
import os
import datetime as dt
from IPython.display import display
from math import radians, cos, sin, asin, sqrt


In [20]:
pd.set_option("display.max.columns", None)

In [21]:
# Define the filepath

data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + '/data'

training_csv = data_dir + '/train_data.csv'
test_csv = data_dir + '/test_data.csv'

print(training_csv)
print(test_csv)

C:\Sam\wids\WiDS-datathon-2023/data/train_data.csv
C:\Sam\wids\WiDS-datathon-2023/data/test_data.csv


#### Define dataframes and add necessary features

In [22]:
# Load the training data set
training_data = pd.read_csv(training_csv)

# Load the test data set
test_data = pd.read_csv(test_csv)

In [None]:
# Get all column names in file
with open('training_columns.txt', 'w', encoding='utf-8') as f:
    for col in training_data.columns:
        f.write(col+'\n')

with open('test_columns.txt', 'w', encoding='utf-8') as f:
    for col in test_data.columns:
        f.write(col+'\n')

with open('training_data_info.txt', 'w', encoding='utf-8') as f:
    training_data.info(verbose=True, buf=f)

# display(training_data.describe())
# display(test_data.describe())

In [23]:
# Find the target column
target_column = training_data.columns.difference(test_data.columns)[0]
print(f'The target column for prediction is {target_column}')

The target column for prediction is contest-tmp2m-14d__tmp2m


In [24]:
# Find any column with empty/null values
print(f'Columns with null vaules in Training data are {training_data.columns[training_data.isnull().any()]}')
print(f'Columns with null vaules in Test data are {test_data.columns[test_data.isnull().any()]}')

Columns with null vaules in Training data are Index(['nmme0-tmp2m-34w__ccsm30', 'nmme-tmp2m-56w__ccsm3',
       'nmme-prate-34w__ccsm3', 'nmme0-prate-56w__ccsm30',
       'nmme0-prate-34w__ccsm30', 'nmme-prate-56w__ccsm3',
       'nmme-tmp2m-34w__ccsm3', 'ccsm30'],
      dtype='object')
Columns with null vaules in Test data are Index([], dtype='object')


In [28]:
# Typecaste startdate to datetime and add epoch for easier handling
training_data.startdate = pd.to_datetime(training_data.startdate, format='%m/%d/%y')
training_data['startdate_epoch'] = (training_data['startdate'] - dt.datetime(1970,1,1)).dt.total_seconds()

test_data.startdate = pd.to_datetime(test_data.startdate, format='%m/%d/%y')
test_data['startdate_epoch'] = (test_data['startdate'] - dt.datetime(1970,1,1)).dt.total_seconds()

In [None]:
# Get current precision of latitude and longitude
loc_data = training_data[['lat','lon']]
precision = loc_data.applymap(lambda x: len(str(x).split('.')[1]))

print(f'Current precision of latitude in training data is {precision.lat.max()}')
print(f'Current precision of longitude in training data is {precision.lon.max()}')

loc_data = test_data[['lat','lon']]
precision = loc_data.applymap(lambda x: len(str(x).split('.')[1]))

print(f'Current precision of latitude in test data is {precision.lat.max()}')
print(f'Current precision of longitude in test data is {precision.lon.max()}')


In [29]:
# Add standard 15 digit decimal places precision to latitude and longitude
training_data['lat'] = training_data['lat'].round(15)
training_data['lon'] = training_data['lon'].round(15)
test_data['lat'] = test_data['lat'].round(15)
test_data['lon'] = test_data['lon'].round(15)

In [30]:
# Need to combine the latitude and longitude for easier data handling
# 'Single-point' Haversine: Calculates the great circle distance between a point on Earth and the (0, 0) lat-long coordinate

def single_pt_haversine(lat, lon, degrees=True):
    
    r = 6371 # Earth's radius (km)

    # Convert decimal degrees to radians
    if degrees:
        lat, lon = map(radians, [lat, lon])

    # 'Single-point' Haversine formula
    a = sin(lat/2)**2 + cos(lat) * sin(lon/2)**2
    d = 2 * r * asin(sqrt(a)) 

    return d

In [31]:
# Combine latitude and longitude to generate unique geolocations
# training_data['location'] = training_data.apply(lambda x: Point(x['lon'], x['lat']), axis=1)
# training_data['location']= training_data[['lat','lon']].values.tolist()

training_data['haversine_distance'] = [single_pt_haversine(x, y) for x, y in zip(training_data.lat, training_data.lon)]
print(f'There are {training_data.haversine_distance.nunique()} unique locations in training data')

There are 514 unique locations in training data


In [32]:
test_data['haversine_distance'] = [single_pt_haversine(x, y) for x, y in zip(test_data.lat, test_data.lon)]
print(f'There are {test_data.haversine_distance.nunique()} unique locations in test data')

There are 514 unique locations in test data


In [33]:
# Get unique locations by combining test and training data and grouping by latitude and longitude
combined_data = pd.concat([training_data,test_data], axis=0)
combined_data['haversine_distance'] = [single_pt_haversine(x, y) for x, y in zip(combined_data.lat, combined_data.lon)]
print(f'There are {combined_data.haversine_distance.nunique()} unique locations in combined data')

There are 532 unique locations in combined data


In [56]:
# Split combined data into training and test dataframe with location
training_data = combined_data.iloc[:len(training_data)]
test_data = combined_data.iloc[len(training_data):]

In [57]:
# Create target data
target_data = training_data[target_column]

### Data Preparation

In [58]:
# Taking a backup of the original training dataframe
training_data_full = training_data
training_data_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375734 entries, 0 to 375733
Columns: 248 entries, index to haversine_distance
dtypes: datetime64[ns](1), float64(245), int64(1), object(1)
memory usage: 713.8+ MB


In [59]:
# Listing the important columns
column_names = ['haversine_distance', 'startdate', 'startdate_epoch', 'lat', 'lon', 'climateregions__climateregion', 'elevation__elevation', 'contest-wind-h500-14d__wind-hgt-500', 'contest-slp-14d__slp', 'contest-pres-sfc-gauss-14d__pres', 'contest-pevpr-sfc-gauss-14d__pevpr', 'contest-precip-14d__precip', 'contest-prwtr-eatm-14d__prwtr', 'contest-rhum-sig995-14d__rhum', 'contest-wind-uwnd-250-14d__wind-uwnd-250', 'contest-wind-uwnd-925-14d__wind-uwnd-925', 'contest-wind-vwnd-250-14d__wind-vwnd-250', 'contest-wind-vwnd-925-14d__wind-vwnd-925', 'mei__mei', 'mei__meirank', 'mei__nip', 'mjo1d__amplitude', 'mjo1d__phase', 'sst-2010-1', 'sst-2010-2', 'sst-2010-3', 'sst-2010-4', 'sst-2010-5', 'sst-2010-6', 'sst-2010-7', 'sst-2010-8', 'sst-2010-9', 'sst-2010-10','contest-tmp2m-14d__tmp2m']

# Drop unnecessary columns
temp_data = training_data.drop(training_data.columns.difference(column_names),axis= 1)
training_data = temp_data.loc[:,column_names]
training_data.head()

Unnamed: 0,haversine_distance,startdate,startdate_epoch,lat,lon,climateregions__climateregion,elevation__elevation,contest-wind-h500-14d__wind-hgt-500,contest-slp-14d__slp,contest-pres-sfc-gauss-14d__pres,contest-pevpr-sfc-gauss-14d__pevpr,contest-precip-14d__precip,contest-prwtr-eatm-14d__prwtr,contest-rhum-sig995-14d__rhum,contest-wind-uwnd-250-14d__wind-uwnd-250,contest-wind-uwnd-925-14d__wind-uwnd-925,contest-wind-vwnd-250-14d__wind-vwnd-250,contest-wind-vwnd-925-14d__wind-vwnd-925,mei__mei,mei__meirank,mei__nip,mjo1d__amplitude,mjo1d__phase,sst-2010-1,sst-2010-2,sst-2010-3,sst-2010-4,sst-2010-5,sst-2010-6,sst-2010-7,sst-2010-8,sst-2010-9,sst-2010-10,contest-tmp2m-14d__tmp2m
0,92.662439,2014-09-01,1409530000.0,0.0,0.833333,BSh,200.0,5899.66,101352.08,98644.97,237.0,94.31,42.45,81.72,-2.56,-5.22,-3.52,4.41,0.961,56.0,4.0,1.23,4.0,352.2,-22.37,-19.69,13.58,19.29,-12.78,-25.2,7.55,-33.72,23.53,28.74448
1,92.662439,2014-09-02,1409616000.0,0.0,0.833333,BSh,200.0,5901.03,101396.02,98686.8,228.9,100.85,42.66,82.56,-2.39,-5.2,-4.49,3.74,0.961,56.0,4.0,1.53,4.0,350.96,-21.58,-20.66,12.14,19.55,-13.34,-25.84,6.36,-34.63,22.98,28.370585
2,92.662439,2014-09-03,1409702000.0,0.0,0.833333,BSh,200.0,5902.18,101429.25,98712.85,220.69,101.25,43.23,83.29,-2.76,-5.0,-5.44,3.4,0.961,56.0,4.0,1.46,4.0,349.86,-20.77,-21.34,10.97,19.5,-13.59,-26.26,5.42,-35.04,22.54,28.133059
3,92.662439,2014-09-04,1409789000.0,0.0,0.833333,BSh,200.0,5903.07,101440.85,98711.7,225.28,101.9,43.11,83.26,-3.0,-4.61,-5.76,3.29,0.961,56.0,4.0,1.51,4.0,348.91,-20.01,-21.92,9.78,19.24,-13.75,-26.48,4.58,-35.28,22.25,28.256798
4,92.662439,2014-09-05,1409875000.0,0.0,0.833333,BSh,200.0,5903.36,101419.53,98686.46,237.24,82.95,42.98,82.5,-3.4,-4.25,-6.09,3.27,0.961,56.0,4.0,1.51,4.0,348.03,-19.25,-22.54,8.51,18.92,-13.85,-26.78,3.79,-35.41,22.23,28.372353


In [50]:
# t_data = training_data[['startdate_epoch','climateregions__climateregion']]

In [60]:
climate_regions = training_data.climateregions__climateregion.unique()

region_values = list()
x = 0
for i in climate_regions:
    region_values.append(x)
    x = x+1

region_dict = dict(zip(climate_regions, region_values))
print(region_dict)
    
training_data = training_data.replace({'climateregions__climateregion': region_dict})
training_data.tail()

{'BSh': 0, 'Cfa': 1, 'BSk': 2, 'BWk': 3, 'BWh': 4, 'Csa': 5, 'Csb': 6, 'Cfb': 7, 'Dfb': 8, 'Dsc': 9, 'Dfc': 10, 'Dfa': 11, 'Dsb': 12, 'Dwa': 13, 'Dwb': 14}


Unnamed: 0,haversine_distance,startdate,startdate_epoch,lat,lon,climateregions__climateregion,elevation__elevation,contest-wind-h500-14d__wind-hgt-500,contest-slp-14d__slp,contest-pres-sfc-gauss-14d__pres,contest-pevpr-sfc-gauss-14d__pevpr,contest-precip-14d__precip,contest-prwtr-eatm-14d__prwtr,contest-rhum-sig995-14d__rhum,contest-wind-uwnd-250-14d__wind-uwnd-250,contest-wind-uwnd-925-14d__wind-uwnd-925,contest-wind-vwnd-250-14d__wind-vwnd-250,contest-wind-vwnd-925-14d__wind-vwnd-925,mei__mei,mei__meirank,mei__nip,mjo1d__amplitude,mjo1d__phase,sst-2010-1,sst-2010-2,sst-2010-3,sst-2010-4,sst-2010-5,sst-2010-6,sst-2010-7,sst-2010-8,sst-2010-9,sst-2010-10,contest-tmp2m-14d__tmp2m
375729,147.140547,2016-08-27,1472256000.0,1.0,0.866667,8,100.0,5763.23,101373.91,97613.96,312.05,57.45,24.32,70.25,23.2,1.32,9.59,2.96,0.186,37.0,3.0,1.08,6.0,345.63,-14.39,-20.19,45.87,12.14,-13.13,-24.3,10.74,-5.53,21.25,17.150954
375730,147.140547,2016-08-28,1472342000.0,1.0,0.866667,8,100.0,5760.19,101397.77,97631.29,305.82,53.53,23.92,71.08,23.18,1.4,7.14,2.3,0.186,37.0,3.0,0.84,6.0,346.04,-14.37,-19.84,45.63,12.5,-13.26,-25.0,9.52,-6.24,22.04,16.962051
375731,147.140547,2016-08-29,1472429000.0,1.0,0.866667,8,100.0,5754.76,101368.67,97588.69,311.62,52.12,23.94,69.74,24.49,1.75,7.05,2.19,0.186,37.0,3.0,0.51,7.0,346.25,-14.01,-19.49,45.18,12.69,-13.49,-25.53,8.29,-6.84,22.72,16.915474
375732,147.140547,2016-08-30,1472515000.0,1.0,0.866667,8,100.0,5742.21,101321.24,97538.62,304.54,51.73,23.61,69.71,25.8,1.84,7.74,1.88,0.186,37.0,3.0,0.56,8.0,346.13,-13.63,-19.17,44.31,12.77,-13.74,-25.9,6.9,-7.46,23.42,16.536761
375733,147.140547,2016-08-31,1472602000.0,1.0,0.866667,8,100.0,5726.45,101323.84,97536.84,295.29,51.83,23.41,71.66,26.62,2.07,8.29,1.66,0.186,37.0,3.0,0.95,8.0,345.8,-13.28,-19.03,43.01,12.75,-14.15,-26.06,5.23,-8.1,24.22,15.910995


In [64]:
training_data['startdate_epoch']= training_data['startdate_epoch'].astype(float)
training_data.drop(['startdate','lat','lon'],axis=1)

Unnamed: 0,haversine_distance,startdate_epoch,climateregions__climateregion,elevation__elevation,contest-wind-h500-14d__wind-hgt-500,contest-slp-14d__slp,contest-pres-sfc-gauss-14d__pres,contest-pevpr-sfc-gauss-14d__pevpr,contest-precip-14d__precip,contest-prwtr-eatm-14d__prwtr,contest-rhum-sig995-14d__rhum,contest-wind-uwnd-250-14d__wind-uwnd-250,contest-wind-uwnd-925-14d__wind-uwnd-925,contest-wind-vwnd-250-14d__wind-vwnd-250,contest-wind-vwnd-925-14d__wind-vwnd-925,mei__mei,mei__meirank,mei__nip,mjo1d__amplitude,mjo1d__phase,sst-2010-1,sst-2010-2,sst-2010-3,sst-2010-4,sst-2010-5,sst-2010-6,sst-2010-7,sst-2010-8,sst-2010-9,sst-2010-10,contest-tmp2m-14d__tmp2m
0,92.662439,1.409530e+09,0,200.0,5899.66,101352.08,98644.97,237.00,94.31,42.45,81.72,-2.56,-5.22,-3.52,4.41,0.961,56.0,4.0,1.23,4.0,352.20,-22.37,-19.69,13.58,19.29,-12.78,-25.20,7.55,-33.72,23.53,28.744480
1,92.662439,1.409616e+09,0,200.0,5901.03,101396.02,98686.80,228.90,100.85,42.66,82.56,-2.39,-5.20,-4.49,3.74,0.961,56.0,4.0,1.53,4.0,350.96,-21.58,-20.66,12.14,19.55,-13.34,-25.84,6.36,-34.63,22.98,28.370585
2,92.662439,1.409702e+09,0,200.0,5902.18,101429.25,98712.85,220.69,101.25,43.23,83.29,-2.76,-5.00,-5.44,3.40,0.961,56.0,4.0,1.46,4.0,349.86,-20.77,-21.34,10.97,19.50,-13.59,-26.26,5.42,-35.04,22.54,28.133059
3,92.662439,1.409789e+09,0,200.0,5903.07,101440.85,98711.70,225.28,101.90,43.11,83.26,-3.00,-4.61,-5.76,3.29,0.961,56.0,4.0,1.51,4.0,348.91,-20.01,-21.92,9.78,19.24,-13.75,-26.48,4.58,-35.28,22.25,28.256798
4,92.662439,1.409875e+09,0,200.0,5903.36,101419.53,98686.46,237.24,82.95,42.98,82.50,-3.40,-4.25,-6.09,3.27,0.961,56.0,4.0,1.51,4.0,348.03,-19.25,-22.54,8.51,18.92,-13.85,-26.78,3.79,-35.41,22.23,28.372353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375729,147.140547,1.472256e+09,8,100.0,5763.23,101373.91,97613.96,312.05,57.45,24.32,70.25,23.20,1.32,9.59,2.96,0.186,37.0,3.0,1.08,6.0,345.63,-14.39,-20.19,45.87,12.14,-13.13,-24.30,10.74,-5.53,21.25,17.150954
375730,147.140547,1.472342e+09,8,100.0,5760.19,101397.77,97631.29,305.82,53.53,23.92,71.08,23.18,1.40,7.14,2.30,0.186,37.0,3.0,0.84,6.0,346.04,-14.37,-19.84,45.63,12.50,-13.26,-25.00,9.52,-6.24,22.04,16.962051
375731,147.140547,1.472429e+09,8,100.0,5754.76,101368.67,97588.69,311.62,52.12,23.94,69.74,24.49,1.75,7.05,2.19,0.186,37.0,3.0,0.51,7.0,346.25,-14.01,-19.49,45.18,12.69,-13.49,-25.53,8.29,-6.84,22.72,16.915474
375732,147.140547,1.472515e+09,8,100.0,5742.21,101321.24,97538.62,304.54,51.73,23.61,69.71,25.80,1.84,7.74,1.88,0.186,37.0,3.0,0.56,8.0,346.13,-13.63,-19.17,44.31,12.77,-13.74,-25.90,6.90,-7.46,23.42,16.536761


In [65]:
print(training_data['startdate_epoch'].dtypes)

float64


### Dummy data training codes

In [66]:
# Train the model
model = LinearRegression().fit(training_data, target_data)

TypeError: The DType <class 'numpy.dtype[datetime64]'> could not be promoted by <class 'numpy.dtype[float64]'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtype[float64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>)

In [None]:
# Predict the target values for the test data
predictions = model.predict(test_data)

In [None]:
# Calculate the mean squared error between the predicted and actual target values
mse = mean_squared_error(target_data, predictions)
print("Mean Squared Error:", mse)