### First model using scikit-learn linear regression
This is a first attempt to train a model on the given data using linear regression.
The data will be analyzed based on start date and location.

#### Import libraries and define constants

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from shapely.geometry import Point

# Import additional libraries
import os
from IPython.display import display


In [None]:
# Define the filepath

# The data directory is resolved as <parent of the current working directory>/data.
# os.path.join is used instead of string concatenation so that separators are
# correct on every platform and no doubled slash appears when the parent
# directory is the filesystem root.
data_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data')

training_csv = os.path.join(data_dir, 'train_data.csv')
test_csv = os.path.join(data_dir, 'test_data.csv')

# Echo the resolved paths so a wrong working directory is spotted immediately
print(training_csv)
print(test_csv)

#### Define dataframes and add necessary features

In [None]:
# Read the training CSV and then the test CSV into their own dataframes
training_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (training_csv, test_csv)
)

In [None]:
# Dump the column names of each dataframe to a text file, one name per line
for out_name, frame in (('training_columns.txt', training_data),
                        ('test_columns.txt', test_data)):
    with open(out_name, 'w', encoding='utf-8') as f:
        f.writelines(col + '\n' for col in frame.columns)

# Optional: uncomment to show summary statistics of both dataframes
# display(training_data.describe())
# display(test_data.describe())

In [None]:
# The target is the column that exists in the training data but not in the test data.
# Index.difference returns the extra labels sorted; the first one is taken.
training_only_columns = training_data.columns.difference(test_data.columns)
target_column = training_only_columns[0]
print(f'The target column for prediction is {target_column}')

In [None]:
# Display summary statistics of the training data (pandas `describe` defaults);
# as the last expression of the cell, the result is rendered as the cell output
training_data.describe()

In [None]:
# Typecast startdate to datetime for easier handling.
# Values are parsed as month/day/two-digit-year.
startdate_format = '%m/%d/%y'
training_data['startdate'] = pd.to_datetime(training_data['startdate'], format=startdate_format)
test_data['startdate'] = pd.to_datetime(test_data['startdate'], format=startdate_format)

In [None]:
# Get current precision of latitude and longitude
def _decimal_places(value):
    """Return the number of characters after the '.' in ``str(value)``.

    Values whose string form has no decimal point (e.g. NaN) count as 0
    instead of raising IndexError.
    NOTE(review): values printed in scientific notation (e.g. 1e-05) are
    not expanded, so their digit count is not meaningful here.
    """
    _, _, fraction = str(value).partition('.')
    return len(fraction)


# Report the largest number of decimal places seen per coordinate column.
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# recent pandas releases, and gives the same element-wise result.
for label, frame in (('training', training_data), ('test', test_data)):
    loc_data = frame[['lat', 'lon']]
    precision = loc_data.apply(lambda column: column.map(_decimal_places))

    print(f'Current precision of latitude in {label} data is {precision.lat.max()}')
    print(f'Current precision of longitude in {label} data is {precision.lon.max()}')


In [None]:
# Round latitude and longitude to a common precision of 15 decimal places.
# Each dataframe is rounded from its OWN coordinates: assigning the training
# columns to test_data would overwrite the test coordinates with index-aligned
# training values.
training_data[['lat', 'lon']] = training_data[['lat', 'lon']].round(15)
test_data[['lat', 'lon']] = test_data[['lat', 'lon']].round(15)

In [None]:
# Merge each row's longitude/latitude pair into a single geolocation object
def _row_to_point(row):
    """Build a shapely Point (x=lon, y=lat) from one dataframe row."""
    return Point(row['lon'], row['lat'])


training_data['location'] = training_data.apply(_row_to_point, axis=1)
print(f"There are {training_data['location'].nunique()} unique locations in training data")

test_data['location'] = test_data.apply(_row_to_point, axis=1)
print(f"There are {test_data['location'].nunique()} unique locations in test data")

In [None]:
# Stack training rows on top of test rows, rebuild the geolocation for every
# row of the stacked frame, and count the distinct locations across both sets
frames_to_stack = [training_data, test_data]
combined_data = pd.concat(frames_to_stack, axis=0)
combined_data['location'] = combined_data.apply(
    lambda row: Point(row['lon'], row['lat']),
    axis=1,
)
unique_location_count = combined_data['location'].nunique()
print(f'There are {unique_location_count} unique locations in combined data')

In [None]:
# Cut the stacked frame back into its two parts: the first rows are the
# training rows, everything after them belongs to the test set.
# The row count is captured before training_data is rebound.
n_training_rows = len(training_data)
training_data, test_data = (
    combined_data.iloc[:n_training_rows],
    combined_data.iloc[n_training_rows:],
)

In [None]:
# Pull the target column of the training rows out as its own series
target_data = training_data.loc[:, target_column]

#### Data wrangling starts here

### Dummy data training code

In [None]:
# Train the model
# Build the feature matrix explicitly instead of passing the whole dataframe:
#   * the target column is excluded, otherwise the model is trained on the very
#     value it is supposed to predict;
#   * only numeric columns are kept, because LinearRegression cannot consume the
#     datetime `startdate` or the shapely `location` objects;
#   * columns with missing values in either dataframe are skipped, because
#     LinearRegression rejects NaN and the same columns are needed at predict time.
numeric_columns = training_data.select_dtypes(include='number').columns
feature_columns = [
    col for col in numeric_columns
    if col != target_column
    and training_data[col].notna().all()
    and test_data[col].notna().all()
]
model = LinearRegression().fit(training_data[feature_columns], target_data)

In [None]:
# Predict the target values for the test data
# Feed the model exactly the columns it was fitted on, in the same order.
# Estimators fitted on a dataframe with string column names record them in
# `feature_names_in_`; when that attribute is absent the frame is passed as is.
fitted_columns = getattr(model, 'feature_names_in_', None)
test_features = test_data if fitted_columns is None else test_data[list(fitted_columns)]
predictions = model.predict(test_features)

In [None]:
# Calculate the mean squared error between the predicted and actual target values
# `target_data` holds the targets of the TRAINING rows, and the test set has no
# target column to score against, so the error is computed on predictions made
# for the training rows (comparing against the test-set `predictions` would mix
# different rows and different lengths).
# NOTE(review): this is a training error and therefore optimistic; a held-out
# validation split would give a fairer estimate.
fitted_columns = getattr(model, 'feature_names_in_', None)
training_features = training_data if fitted_columns is None else training_data[list(fitted_columns)]
training_predictions = model.predict(training_features)
mse = mean_squared_error(target_data, training_predictions)
print("Mean Squared Error:", mse)