This notebook is structured as follows:
1. Basics (Import libraries / Read in file)
2. Clean dataset
3. Optimise R^2
4. Statistical Analysis
5. Visualise results
6. Save results
7. Tips and tricks

***BASICS***

In [3]:
# import all the relevant stuff in the beginning

# basics
import warnings

import numpy as np

# work with dataframes
import pandas as pd
import geopandas as gpd

# statistical analysis
import esda
from esda.moran import Moran
from splot.esda import moran_scatterplot, lisa_cluster, plot_local_autocorrelation

import pysal as ps
from pysal.lib import weights
from libpysal.io import open as psopen
import libpysal

# data visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# linear regression
import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pandas.plotting import scatter_matrix

%matplotlib inline

In [None]:
# read in .csv file
file_name = 'data/Afghanistan.csv' # file path
df = pd.read_csv(file_name)
# A cell only renders its last bare expression, so a df.head() followed by
# df.tail() would show the tail only. display() renders both.
display(df.head(), df.tail())

In [None]:
# first look at the csv data: structure report, then summary statistics
df.info()
summary_stats = df.describe()
summary_stats

In [None]:
# play around with read_csv parameters
# Each line re-reads the file and rebinds df, so only the last one is in effect
# afterwards -- keep the variant you actually need.
df = pd.read_csv(file_name, index_col=['id'])               # use column 'id' as the index
df = pd.read_csv(file_name, header=1)                       # take column names from the second line (row numbers are 0-based)
df = pd.read_csv(file_name, skiprows=1)                     # skip the first line of the file
df = pd.read_csv(file_name, skipfooter=1, engine='python')  # skip the last line; skipfooter is unsupported by the default C engine

In [None]:
# assign new column names (one name per existing column)
# Assign to the attribute only: a chained `df = df.columns = [...]` would first
# rebind df to the plain list and then fail when setting `.columns` on it.
df.columns = ['Column name 1', 'Column name 2']

In [1]:
# load the shapefile with geopandas
shp_path = 'data/IMD/lab04_imd.shp'
geo = gpd.read_file(shp_path)

In [None]:
# first look at the shapefile: draw it, then list the dtype of every column
geo_axes = geo.plot()
geo.dtypes

***CLEAN DATA***

In [None]:
# divide dataset -- two alternative ways to take a subset of the columns
# Each result gets its own name and df stays untouched: applying both steps to
# df in sequence (select two columns, then drop the same two) would leave a
# frame without any columns.
df_selected = df[['Column 1', 'Column 2']]                # keep only the listed columns
# or
df_remaining = df.drop(columns=['Column 1', 'Column 2'])  # keep everything except the listed columns

In [None]:
# remove rows by their index labels
rows_to_drop = ['Row 1', 'Row 2']
df = df.drop(index=rows_to_drop)

In [None]:
# use an existing column as the row index (df is modified in place)
df.set_index(keys='column_name', inplace=True)

In [None]:
# undo set_index: the index goes back into a regular column
df = df.reset_index(drop=False)

In [None]:
# sort rows: 'column 1' descending, ties ordered by 'column 2' ascending
# (the sorted frame is only displayed here; df itself is not reassigned)
sort_columns = ['column 1', 'column 2']
sort_ascending = [False, True]
df.sort_values(by=sort_columns, ascending=sort_ascending)

In [None]:
# aggregate data: for each year in df, report count, min and max of 'rating'
rating_stats = ['count', 'min', 'max']
df.groupby('year').agg({'rating': rating_stats})

In [None]:
# combine datasets by stacking the frames on top of each other (row-wise concatenation)
frames = [df1, df2, df3]
pd.concat(frames)

In [None]:
# handle missing values -- two alternatives
# dropna() returns a new frame and leaves df unchanged, so the result has to be
# stored; a bare `df.dropna()` in the middle of a cell has no effect at all.
df_dropped = df.dropna()   # rows containing NaN removed
df = df.fillna(value=0)    # or: replace NaNs with 0

***OPTIMISE R^2***

In [None]:
# baseline OLS fit on all predictors
# NOTE(review): X_train / y_train are created in the linear-regression cell of the
# STATISTICAL ANALYSIS section -- that cell has to run before this one.
display(X_train.head())    # a bare `X_train` in the middle of a cell is not rendered
model_sm = sm.OLS(y_train, X_train).fit()
print(model_sm.summary())

In [None]:
# refit after removing 'col1' and 'col2' from the predictors
dropped_round_2 = ['col1', 'col2']
X_train2 = X_train.drop(columns=dropped_round_2)
ols_round_2 = sm.OLS(y_train, X_train2)
model_sm2 = ols_round_2.fit()
print(model_sm2.summary())

In [None]:
# next iteration of the search for the best r^2: additionally remove 'col3' and 'col4'
dropped_round_3 = ['col3', 'col4']
X_train3 = X_train2.drop(columns=dropped_round_3)
ols_round_3 = sm.OLS(y_train, X_train3)
model_sm3 = ols_round_3.fit()
print(model_sm3.summary())

***STATISTICAL ANALYSIS***

In [None]:
# perform linear regression
# statsmodels does not add an intercept on its own: the constant column has to be
# added explicitly with `sm.add_constant`.

# 1. Load data
df = pd.read_csv('data.csv', sep='\t')  # Replace with your file path; sep='\t' reads a tab-delimited file

# 2. Explore data
print(df.head())                        # Show the first few rows
print(df.describe())                    # Summary statistics
df.info()                               # Data types and non-null counts (prints its report itself and returns None)

# 3. Handle missing values
# Pick ONE strategy. Alternative to dropping: df.fillna(value=0) replaces NaNs with 0
# (applying it after dropna() would have nothing left to fill).
df = df.dropna()                        # Drop rows with missing values

# 4. Split data into training and test sets
feature_cols = ['feature1', 'feature2']
X = df[feature_cols]                    # Independent variables
y = df['target']                        # Dependent variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Using statsmodels for LR
X_train = sm.add_constant(X_train)      # prepend a column of ones ('const') so the model gets an intercept

model_sm = sm.OLS(y_train, X_train)     # build the (still unfitted) OLS model
results_sm = model_sm.fit()             # do the fit; the regression info lives in results_sm

# Predict from the fitted results. On the unfitted model, predict() expects the
# parameter vector as its first argument, so passing the test matrix there is wrong.
# has_constant='add' forces the 'const' column even if a test feature happens to be constant.
X_test_sm = sm.add_constant(X_test, has_constant='add')
y_pred_sm = results_sm.predict(X_test_sm)

# useful OLS info (warnings are silenced for this printout only, not for the whole notebook)
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    print(results_sm.summary())

# pull the beta parameters out from results_sm
# params is labelled ('const', 'feature1', ...), so positional access goes through .iloc
beta0_sm = results_sm.params.iloc[0]    # intercept
beta1_sm = results_sm.params.iloc[1]    # coefficient of the first feature

# 6. Using scikit-learn for LR (other option)
model = LinearRegression()              # Create the model (fits its own intercept)

# Fit on the raw feature columns only: X_train also carries 'const' by now, while
# X_test does not, and the columns used for fit and predict have to match.
model.fit(X_train[feature_cols], y_train)

y_pred = model.predict(X_test)          # Predictions

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse:.4f} | R^2: {r2:.4f}')

# 7. Visualize results
# Actual vs predicted target values plotted against the first feature
plt.scatter(X_test['feature1'], y_test, color='blue', label='Actual')
plt.scatter(X_test['feature1'], y_pred, color='red', label='Predicted')
plt.xlabel('Feature 1')
plt.ylabel('Target Variable')
plt.title('Actual vs Predicted')
plt.legend()
plt.show()


In [None]:
# Moran's I (global spatial autocorrelation)
# `geo` is the GeoDataFrame loaded from the shapefile in the BASICS section.

# 1. Define spatial weights
# Both variants are built under their own name; `w` is the one used below.
# (Assigning both to `w` directly would silently keep only the second.)
w_queen = weights.contiguity.Queen.from_dataframe(geo)    # contiguity: Queen = shared edge or corner (Rook = shared edge only)
w_knn = weights.distance.KNN.from_dataframe(geo, k=5)     # distance: each observation is linked to its 5 nearest neighbours
w = w_knn                                                 # switch to w_queen to use contiguity instead

# 2. Select your variable of interest
y = geo['target_variable']

# 3. Calculate Moran's I
moran = esda.Moran(y, w)

# 4. Print the result
print("Moran's I:", moran.I)
print("P-value:", moran.p_sim)
print("Expected I:", moran.EI)

# How to read the output:
# - Moran's I: the observed statistic. Positive values indicate clustering of
#   similar values, negative values suggest dispersion.
# - P-value: the significance of the Moran's I value.

In [None]:
# 5. Moran scatterplot for the statistic computed in the previous cell
moran_plot = moran_scatterplot(moran)
fig, ax = moran_plot
plt.show()

***VISUALISE RESULTS***

In [None]:
# line chart of a single column
line_values = df['Column name']
lineplot = sns.lineplot(line_values)

In [None]:
# horizontal bar chart of a single column
bar_values = df['Column name']
barh = sns.barplot(bar_values, orient='h')

In [None]:
# vertical bar chart of a single column
bar_values = df['Column name']
barv = sns.barplot(bar_values, orient='v')

In [None]:
# scatter plot of a single column
scatter_values = df['Column name']
scatterplot = sns.scatterplot(scatter_values)

In [None]:
# histogram -- drawn twice, once with histplot and once with displot
# (`histogram` ends up holding the displot result, since it is assigned last)
hist_values = df['Column name']
histogram = sns.histplot(hist_values)
histogram = sns.displot(hist_values)
# rotate the x tick labels by -45 degrees; plt.xticks acts on the current axes
plt.xticks(rotation=-45, fontsize=10)

In [None]:
# kernel density estimate of a single column, area under the curve filled
kde_values = df['Column name']
kde = sns.kdeplot(kde_values, fill=True)
# optional: rotate the x tick labels by -45 degrees
plt.xticks(rotation=-45, fontsize=10)

In [None]:
# box plot
# Stored under its own name: binding the result to `scatterplot` would overwrite
# the handle created in the scatter plot cell.
boxplot = sns.boxplot(df['Column name'])

In [None]:
# violin plot of a single column
violin_values = df['Column name']
violinplot = sns.violinplot(violin_values)

***SAVE RESULTS***

In [None]:
# write the table to a comma-separated file, leaving the index out
csv_path = 'foldername/filename.csv'
geo.to_csv(csv_path, sep=',', index=False)

In [None]:
# save as image
# Save through a Figure handle (here `fig` from the Moran scatterplot cell).
# With the inline backend a figure is closed once its cell has finished, so
# plt.savefig() in a later cell of its own would write an empty canvas.
fig.savefig('image.png')

In [None]:
# save as pdf
# matplotlib picks the output format from the file extension; `fig` is the
# Figure handle from the Moran scatterplot cell.
fig.savefig('image.pdf')

***TIPS AND TRICKS***

In [None]:
# report a single cell value together with its column and row label
print('Cell value of "', column_name, '" and "', row_name, '" is: ', value, sep='')