In [1]:
import pandas as pd

# Districts to load; each name doubles as the sub-directory name and the
# file-name prefix under ./Weather, so the spellings must match the files on disk.
districts = ['amritsar', 'barnala', 'bathinda', 'faridkot', 'fatehgarhsahib', 'fazilka', 'ferozepur', 'gurdaspur', 'hoshiarpur', 'jalandhar', 'kapurthala', 'ludhiana', 'mansa', 'moga', 'pathankot', 'patiala', 'rupnagar', 'sahibzadaajitsinghnagar(mohali)', 'nawanshahr', 'sangrur', 'shrmukatsarsahib', 'tarntaran']

# One dataframe per district, collected here and concatenated once at the end
weather_dfs = []

for district in districts:
    # Per-district averages live at ./Weather/<district>/<district>_average_weather.csv
    weather_csv_path = f'./Weather/{district}/{district}_average_weather.csv'
    district_weather_data = pd.read_csv(weather_csv_path)

    # Tag every row with its district, as the first column, so rows stay
    # identifiable after concatenation
    district_weather_data.insert(0, 'District', district)

    weather_dfs.append(district_weather_data)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every district contributes its own 0..k labels and the index contains
# duplicates, which makes label-based lookups on the result ambiguous.
all_weather_data = pd.concat(weather_dfs, ignore_index=True)

# Persist the combined table (the index is not written, so the CSV is unchanged)
all_weather_data.to_csv('all_districts_weather_data.csv', index=False)


In [10]:
import pandas as pd
# Read the merged training table once and bind it to both names
# (`all_weather_data` and `data`) so they refer to the same dataframe.
data = all_weather_data = pd.read_csv("../train_data/final_merged_data.csv")

# NOTE(review): a notebook cell only renders its last expression, so this
# summary is computed but never displayed.
all_weather_data.describe()

# Displayed as the cell result: the column names of the merged table.
all_weather_data.columns

Index(['District', 'Year', 'Month', 'Avg_Temp', 'Avg_Feelslike', 'Avg_Dew',
       'Avg_Humidity', 'Avg_Precipitation', 'Avg_Precipitation_Probability',
       'Avg_Precipitation_Coverage', 'Avg_Snowfall', 'Avg_Snow_Depth',
       'Avg_Wind_Gust', 'Avg_Wind_Speed', 'Avg_Wind_Direction', 'Avg_Pressure',
       'Avg_Cloud_Cover', 'Avg_Visibility', 'Avg_Solar_Radiation',
       'Avg_Solar_Energy', 'Avg_UV_Index', 'Cases'],
      dtype='object')

In [16]:
# Calculate Pearson correlation coefficient manually
def pearson_correlation(x, y):
    """Return the Pearson correlation coefficient of two numeric sequences.

    The coefficient is computed from mean-centred deviations instead of raw
    sums of squares. The raw-sum form subtracts two nearly equal quantities,
    which loses precision for values with a large offset and can leave a
    variance term slightly negative, in which case ``** 0.5`` yields a
    complex number.

    Parameters
    ----------
    x, y : iterable of numbers
        Paired observations. Each value is converted with ``float``; both
        inputs must contain the same number of values.

    Returns
    -------
    float
        The correlation, limited to ``[-1, 1]``. ``0.0`` is returned when the
        coefficient is undefined (empty input, or either input is constant).
        NaN inputs propagate to a NaN result.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have different lengths (previously the longer
        input was silently truncated by ``zip``).
    """
    xs = [float(value) for value in x]
    ys = [float(value) for value in y]

    n = len(xs)
    if n != len(ys):
        raise ValueError(
            f"x and y must have the same length (got {n} and {len(ys)})"
        )
    if n == 0:
        return 0.0

    # A constant input has zero variance, so the coefficient is undefined.
    # Checking the values directly is exact, unlike testing a rounded variance.
    if all(value == xs[0] for value in xs) or all(value == ys[0] for value in ys):
        return 0.0

    mean_x = sum(xs) / n
    mean_y = sum(ys) / n
    dev_x = [value - mean_x for value in xs]
    dev_y = [value - mean_y for value in ys]

    numerator = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
    # Both factors are sums of squares, hence never negative.
    denominator = (sum(dx * dx for dx in dev_x) * sum(dy * dy for dy in dev_y)) ** 0.5
    if denominator == 0:
        return 0.0

    r = numerator / denominator
    # Rounding can push |r| marginally past 1. Plain comparisons are used so
    # that a NaN result falls through unchanged.
    if r > 1.0:
        return 1.0
    if r < -1.0:
        return -1.0
    return r

# Columns to exclude from correlation analysis (identifiers, not measurements)
exclude_columns = ['District','Month', 'Year']

# Correlation of every remaining column with the target column
correlations = {}
# Columns whose values could not be converted to float; they are reported
# below instead of being dropped silently
skipped_columns = []
y_column = 'Cases'
for column in data:
    if column == y_column or column in exclude_columns:
        continue
    try:
        # Only the conversion is guarded, so an error raised by the
        # correlation itself is not hidden by this handler.
        column_values = [float(value) for value in data[column]]
    except (TypeError, ValueError):
        skipped_columns.append(column)
        continue
    correlation = pearson_correlation(column_values, data[y_column])
    correlations[column] = correlation

if skipped_columns:
    print("Skipped non-numeric columns:", skipped_columns)

# Columns whose absolute correlation with 'Cases' falls below this value are
# treated as uninformative
threshold = 0.1  # Adjust as needed
low_correlation_columns = [column for column, correlation in correlations.items() if abs(correlation) < threshold]

# Print columns with low correlation
print("Columns with low correlation with 'Cases':", low_correlation_columns)

Columns with low correlation with 'Cases': ['Avg_Humidity', 'Avg_Snowfall', 'Avg_Snow_Depth', 'Avg_Wind_Gust', 'Avg_Wind_Speed', 'Avg_Solar_Radiation', 'Avg_Solar_Energy', 'Avg_UV_Index']


In [2]:
import pandas as pd

# Weather table for all districts, with every numeric column rounded to 2 decimals
merged_weather_data = pd.read_csv('./all_districts_weather_data.csv').round(2)

# Cases table: melt the remaining columns into 'Year' / 'Cases' (long format)
# and cast 'Year' to int so it can be used as a merge key
cases_data = (
    pd.read_csv('modified_file.csv')
    .melt(id_vars=['District', 'Month'], var_name='Year', value_name='Cases')
    .astype({'Year': int})
)

# Join on the shared keys (default inner join: only rows present in both
# tables are kept)
final_merged_data = merged_weather_data.merge(
    cases_data,
    on=['District', 'Month', 'Year'],
)

# Write the joined table out without the index column
final_merged_data.to_csv('final_merged_data.csv', index=False)


In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
final_merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1008 entries, 0 to 1007
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   District                       1008 non-null   object 
 1   Year                           1008 non-null   int64  
 2   Month                          1008 non-null   int64  
 3   Avg_Temp                       1008 non-null   float64
 4   Avg_Feelslike                  1008 non-null   float64
 5   Avg_Dew                        1008 non-null   float64
 6   Avg_Humidity                   1008 non-null   float64
 7   Avg_Precipitation              1008 non-null   float64
 8   Avg_Precipitation_Probability  1008 non-null   float64
 9   Avg_Precipitation_Coverage     1008 non-null   float64
 10  Avg_Snowfall                   1008 non-null   float64
 11  Avg_Snow_Depth                 1008 non-null   float64
 12  Avg_Wind_Gust                  1008 non-null   f

In [27]:
# Cut-off below which a column's absolute correlation is considered too weak
threshold = 0.1  # Adjust as needed
low_correlation_columns = [
    name for name in correlations if abs(correlations[name]) < threshold
]

# The weakly correlated columns plus 'District', 'Cases', 'Year' and 'Month'
# make up the full removal list
columns_to_remove = [*low_correlation_columns, 'District', 'Cases', 'Year', 'Month']

# Show the list itself, then the same names as a one-column DataFrame
print(columns_to_remove)
df_to_remove = pd.DataFrame(columns_to_remove)
print(df_to_remove)

['Avg_Humidity', 'Avg_Snowfall', 'Avg_Snow_Depth', 'Avg_Wind_Gust', 'Avg_Wind_Speed', 'Avg_Solar_Radiation', 'Avg_Solar_Energy', 'Avg_UV_Index', 'District', 'Cases', 'Year', 'Month']
                      0
0          Avg_Humidity
1          Avg_Snowfall
2        Avg_Snow_Depth
3         Avg_Wind_Gust
4        Avg_Wind_Speed
5   Avg_Solar_Radiation
6      Avg_Solar_Energy
7          Avg_UV_Index
8              District
9                 Cases
10                 Year
11                Month


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Load the merged weather + cases dataset
df = pd.read_csv('../train_data/final_merged_data.csv')

# Restrict the model to a single district (the old comment said 'jalandhar'
# while the code filtered on 'amritsar'; the code's value is kept)
target_district = 'amritsar'
data = df[df['District'] == target_district]

# Columns excluded from the feature matrix: the target itself, identifier
# columns, and the weather variables left out of the model
columns_to_remove = ['_id','Cases', 'District', 'Year', 'Month', 'Avg_Snowfall', 'Avg_Snow_Depth', 
                    'Avg_Wind_Gust', 'Avg_Wind_Speed', 'Avg_Solar_Radiation', 'Avg_Solar_Energy', 'Avg_UV_Index']

# errors='ignore' skips listed columns that are absent instead of raising
# KeyError. NOTE(review): the column listing of this same CSV printed elsewhere
# in the notebook shows no '_id' column, so a strict drop would fail there.
# Any remaining column that contains NaN is dropped as well.
X = data.drop(columns=columns_to_remove, errors='ignore').dropna(axis=1)
y = data['Cases']

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

# Fit an ordinary least-squares linear regression on the training rows
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Round the predictions to the nearest integer; the metrics below are
# computed on these rounded values, not on the raw predictions
y_pred_r = np.round(y_pred)

# Evaluate the rounded predictions against the held-out targets
mse = mean_squared_error(y_test, y_pred_r)
r2 = r2_score(y_test, y_pred_r)
mae = mean_absolute_error(y_test, y_pred_r)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2) Score:", r2)
print("MAE Score:", mae)


Mean Squared Error (MSE): 2.2
R-squared (R2) Score: 0.0
MAE Score: 1.4
