In [None]:
import os
import pandas as pd

# Directory where the CSV files are saved
csv_files_directory = './'  # Change this to the directory where the CSV files are saved

# Collect every file whose name ends with '_gdp_data.csv'.
# Sorted because os.listdir() returns entries in arbitrary order; sorting makes
# the order of country_dataframes reproducible between runs and machines.
csv_files = sorted(
    file for file in os.listdir(csv_files_directory) if file.endswith('_gdp_data.csv')
)

# One DataFrame per matching file, in the same order as csv_files
country_dataframes = []

for csv_file in csv_files:
    # os.listdir() yields bare file names, so join them with the directory;
    # otherwise the read only works when csv_files_directory is the cwd.
    country_df = pd.read_csv(os.path.join(csv_files_directory, csv_file))
    country_dataframes.append(country_df)


In [None]:
import os
import pandas as pd

def merge_csv_files(directory_path):
    """Inner-join every '*_modified.csv' file of a directory on its 'Year' column.

    Parameters
    ----------
    directory_path : str
        Directory scanned (non-recursively) for files whose name ends with
        '_modified.csv'. Every such file must contain a 'Year' column.

    Returns
    -------
    pandas.DataFrame or None
        The merged table indexed by 'Year' in ascending order, or None when
        no file in the directory matches.

    Notes
    -----
    Files are merged in sorted file-name order so that the column order of
    the result is reproducible; os.listdir() gives no ordering guarantee.
    The join is an inner join, so only years present in every file are kept.
    The path of each file is printed as it is read.
    """
    merged_df = None

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith("_modified.csv"):
            continue

        file_path = os.path.join(directory_path, filename)
        print(file_path)
        df = pd.read_csv(file_path)

        # The first matching file seeds the result; later ones are joined onto it.
        if merged_df is None:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on='Year', how='inner')

    if merged_df is None:
        return None

    # Build the indexed, sorted frame as a new object instead of mutating in place.
    return merged_df.set_index('Year').sort_index()

if __name__ == "__main__":
    # Directory holding the '*_modified.csv' inputs.
    # NOTE(review): this is an absolute, machine-specific path; it will not
    # exist on another machine and should come from configuration.
    directory_path = "/Users/heliaa/University/Code"

    # Combine all matching files into one table indexed by 'Year'
    merged_df = merge_csv_files(directory_path)

    # merge_csv_files() returns None when no file matched; otherwise the
    # merged table is written to the current working directory.
    if merged_df is None:
        print("No CSV files found for merging.")
    else:
        merged_df.to_csv("merged_data.csv")


In [None]:
# Load merged_data.csv, using its 'Year' column as the row index
df = pd.read_csv('merged_data.csv', index_col='Year')

In [None]:
# Pairwise correlation between the columns of df ('pearson' is the pandas default,
# spelled out here). The bare name on the last line renders the matrix as the
# cell output.
correlation_matrix = df.corr(method='pearson')
correlation_matrix

In [None]:
# Correlation matrix over every column of df (no subset of rows or columns is taken);
# 'pearson' is the pandas default method, written explicitly.
global_correlation_matrix = df.corr(method='pearson')

In [None]:
# Flatten the correlation matrix into a Series keyed by (column, column) pairs,
# ordered from the strongest positive correlation downwards.
correlation_pairs = correlation_matrix.unstack().sort_values(ascending=False)

# Remove each column's correlation with itself by comparing the two labels of
# the pair. Testing the value against 1.0 is unreliable: a diagonal entry can
# differ from 1.0 by floating-point error, and two distinct columns that are
# perfectly correlated would be discarded by mistake.
pair_first = correlation_pairs.index.get_level_values(0)
pair_second = correlation_pairs.index.get_level_values(1)
top_5_corr_columns = correlation_pairs[pair_first != pair_second][:5]

# NOTE: the matrix is symmetric, so every pair appears twice, as (a, b) and
# (b, a); the five entries therefore cover fewer than five distinct pairs.
print("Top 5 correlated column pairs:")
print(top_5_corr_columns)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sub-matrix of the global correlation matrix restricted to the columns that
# take part in the top pairs. The labels are de-duplicated (first-seen order is
# kept) so that a column occurring in several pairs contributes a single
# row/column to the heatmap instead of repeated identical ones.
top_pair_rows = top_5_corr_columns.index.get_level_values(0).unique()
top_pair_cols = top_5_corr_columns.index.get_level_values(1).unique()
top_5_corr_matrix = global_correlation_matrix.loc[top_pair_rows, top_pair_cols]


# Plot the correlation heatmap using seaborn; the colour scale is fixed to
# [-1, 1] and centred on 0 so colours are comparable between heatmaps.
plt.figure(figsize=(4, 4))
sns.heatmap(top_5_corr_matrix, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Top 5 Columns')
plt.show()


In [None]:
# Distinct column names appearing first in the top correlated pairs
top_5_corr_column_names = top_5_corr_columns.index.get_level_values(0).unique()

# One stacked panel per selected column. The number of rows follows the number
# of columns actually selected instead of assuming there are exactly five.
n_panels = len(top_5_corr_column_names)

plt.figure(figsize=(8, 8))
for i, column in enumerate(top_5_corr_column_names, 1):
    plt.subplot(n_panels, 1, i)
    plt.plot(df.index, df[column])
    plt.ylabel(column)

plt.xlabel('Date')
plt.suptitle('Time Series Plots of Top 5 Correlated Columns')
# Lay the figure out once, after every panel and the title exist, rather than
# on each loop iteration.
plt.tight_layout()
plt.show()

In [None]:
# Restrict df to the columns named in top_5_corr_column_names
top_5_corr_data = df.loc[:, top_5_corr_column_names]

# Write the subset to disk and read it straight back; after the round trip
# 'Year' is an ordinary column of `data` rather than its index.
csv_path = 'top_5_correlated_countries_with_year.csv'
top_5_corr_data.to_csv(csv_path)
data = pd.read_csv(csv_path)

# NOTE(review): assumes 'thailand' is one of the selected columns; a KeyError
# is raised here otherwise. Confirm against the selection above.
X = data.drop(columns=['Year', 'thailand'])
y = data['thailand']

# Parse the year values (via their string form) into timestamps and use them
# as the index.
year = data['Year'].astype(str)
data['Year'] = pd.to_datetime(year)
data = data.set_index('Year')

# Target column: 'thailand' moved up by 6 rows, so each row carries the value
# found 6 rows later; the last 6 rows end up without a target (NaN).
data['Prediction'] = data['thailand'].shift(-6)

In [None]:
# Show the frame (indexed by 'Year', with the added 'Prediction' column) as the cell output
data

In [None]:
# Features: every column except 'Prediction'. Target: 'Prediction', i.e.
# 'thailand' shifted up by 6 rows.
# The last 6 rows are left out because the shift leaves their target as NaN.
#
# `columns=` replaces the positional axis argument of drop([...], 1), which
# newer pandas versions no longer accept, and `.to_numpy()` removes the need
# for NumPy to be imported before this cell runs.
X = data.drop(columns=['Prediction']).to_numpy()[:-6]
y = data['Prediction'].to_numpy()[:-6]

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Fit a linear regression of y on X; fit() returns the estimator itself, so
# construction and fitting are chained.
model = LinearRegression().fit(X, y)

# Keep the fitted parameters: the intercept and the per-feature slopes
intercept = model.intercept_
coefficients = model.coef_

In [None]:
# Feature rows used for forecasting: the last 6 observations, i.e. the rows
# that could not be used for training because their shifted target is NaN.
# (`columns=` replaces the positional axis argument that newer pandas rejects.)
forecast = data.drop(columns=['Prediction']).to_numpy()[-6:]

pred = model.predict(forecast)

# Round to the nearest integer. A bare astype(int) truncates toward zero,
# which is not what "round" means (e.g. 1.9 would become 1).
pred = pred.round().astype(int)

# One date per prediction, stepping one calendar year at a time from the last
# observed date. A DateOffset keeps the forecasts on the same month/day as the
# observations (the 'A' alias used before is year-END anchored and is
# deprecated in recent pandas releases).
one_year = pd.DateOffset(years=1)
forecast_dates = pd.date_range(start=data.index[-1] + one_year, periods=len(pred), freq=one_year)
forecast_df = pd.DataFrame({'thailand': pred}, index=forecast_dates)

# Append the forecast rows below the observed data; columns other than
# 'thailand' are NaN on the forecast rows.
merged_df = pd.concat([data, forecast_df])

merged_df.to_csv('final_modified1.csv')

In [None]:
# Per-column mean of the absolute correlations with every column
average_correlations = correlation_matrix.abs().mean(axis=0)

# The five columns whose mean absolute correlation is highest
next_5_corr_columns = average_correlations.nlargest(n=5)

# Heading and values on separate lines, as two consecutive prints would give
print("Next 5 correlated columns:", next_5_corr_columns, sep="\n")

In [None]:
# Names of the columns with the highest mean absolute correlation
# (this rebinds top_5_corr_column_names to the new selection)
top_5_corr_column_names = next_5_corr_columns.index.get_level_values(0).unique()

# One stacked panel per selected column. The number of rows follows the number
# of columns actually selected instead of assuming there are exactly five.
n_panels = len(top_5_corr_column_names)

plt.figure(figsize=(8, 8))
for i, column in enumerate(top_5_corr_column_names, 1):
    plt.subplot(n_panels, 1, i)
    plt.plot(df.index, df[column])
    plt.ylabel(column)

plt.xlabel('Date')
plt.suptitle('Time Series Plots of Top 5 Correlated Columns')
# Lay the figure out once, after every panel and the title exist, rather than
# on each loop iteration.
plt.tight_layout()
plt.show()

In [None]:
# Restrict df to the columns named in top_5_corr_column_names
top_5_corr_data = df.loc[:, top_5_corr_column_names]

# Write the subset to disk and read it straight back; after the round trip
# 'Year' is an ordinary column of `data` rather than its index.
csv_path = 'top_5_correlated_countries_with_year.csv'
top_5_corr_data.to_csv(csv_path)
data = pd.read_csv(csv_path)

# NOTE(review): assumes 'thailand' is one of the selected columns; a KeyError
# is raised here otherwise. Confirm against the selection above.
X = data.drop(columns=['Year', 'thailand'])
y = data['thailand']

# Parse the year values (via their string form) into timestamps and use them
# as the index.
year = data['Year'].astype(str)
data['Year'] = pd.to_datetime(year)
data = data.set_index('Year')

# Target column: 'thailand' moved up by 6 rows, so each row carries the value
# found 6 rows later; the last 6 rows end up without a target (NaN).
data['Prediction'] = data['thailand'].shift(-6)

In [None]:
# Features: every column except 'Prediction'. Target: 'Prediction', i.e.
# 'thailand' shifted up by 6 rows. The last 6 rows are left out because the
# shift leaves their target as NaN.
# (`columns=` replaces the positional axis argument that newer pandas rejects.)
X = data.drop(columns=['Prediction']).to_numpy()[:-6]
y = data['Prediction'].to_numpy()[:-6]


# Lists for MAE / MSE results (the '_rf' and '_lr' suffixes presumably stand
# for two model types; confirm). Nothing in this cell fills them.
mae_results_rf = []
mse_results_rf = []
mae_results_lr = []
mse_results_lr = []


# NOTE: a per-row MinMaxScaler loop used to sit here. It could never run:
# X is a NumPy array at this point (no .iterrows()), MinMaxScaler is not
# imported anywhere in view, and the scaled rows were never passed to the
# model. It was removed so the cell executes; the model is fitted on the
# unscaled features, exactly as the remaining code already did.


# Create and fit the LinearRegression model
model = LinearRegression()
model.fit(X, y)


# Feature rows used for forecasting: the last 6 observations, i.e. the rows
# excluded from training above.
forecast = data.drop(columns=['Prediction']).to_numpy()[-6:]

pred = model.predict(forecast)

# Round to the nearest integer. A bare astype(int) truncates toward zero,
# which is not what "round" means (e.g. 1.9 would become 1).
pred = pred.round().astype(int)

# One date per prediction, stepping one calendar year at a time from the last
# observed date. A DateOffset keeps the forecasts on the same month/day as the
# observations (the 'A' alias used before is year-END anchored and is
# deprecated in recent pandas releases).
one_year = pd.DateOffset(years=1)
forecast_dates = pd.date_range(start=data.index[-1] + one_year, periods=len(pred), freq=one_year)
forecast_df = pd.DataFrame({'thailand': pred}, index=forecast_dates)

# Append the forecast rows below the observed data; columns other than
# 'thailand' are NaN on the forecast rows.
merged_df = pd.concat([data, forecast_df])

merged_df.to_csv('final_modified2.csv')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Load the data
data = pd.read_csv('canada_gdp_data.csv')

# Sort the rows in ascending order (columns compared left to right) and
# renumber them from 0
data = data.sort_values(by=data.columns.tolist()).reset_index(drop=True)

# Divide into train and validation sets: the first 70% of the rows are used
# for training, the remainder for validation.
split_at = int(0.7 * len(data))

# Preprocessing (since arima takes univariate series as input): drop 'Year'.
# The column is dropped while each subset is built, which yields independent
# frames; dropping in place on a row slice of `data` is what raises pandas'
# SettingWithCopyWarning.
train = data.iloc[:split_at].drop(columns='Year')
valid = data.iloc[split_at:].drop(columns='Year')
print(train.dtypes)

In [None]:
# Draw the training and the validation 'GDP' series on the same axes, then
# render the figure.
for subset in (train, valid):
    subset['GDP'].plot()
plt.show()