This notebook is structured as follows:
1. Basics (Import libraries / Read in file)
2. Clean dataset
3. Optimise R^2
4. Statistical Analysis
5. Visualise results
6. Save results
7. Tips and tricks

***BASICS***

In [None]:
# import all the relevant stuff in the beginning

# basics
import numpy as np

# work with dataframes
import pandas as pd
import geopandas as gpd

# statistical analysis
import esda
from esda.moran import Moran
from splot.esda import moran_scatterplot, lisa_cluster, plot_local_autocorrelation

import pysal as ps 
from pysal.lib import weights 
from libpysal.io import open as psopen 
import libpysal 

# data visualisation
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# load the .csv file into a DataFrame
file_name = 'data/Afghanistan.csv'  # file path
df = pd.read_csv(filepath_or_buffer=file_name)
# peek at the first and the last five rows
# (in a notebook only the last expression of a cell is rendered)
df.head(n=5)
df.tail(n=5)

In [None]:
# initial information csv
df.info() # prints a summary of the DataFrame (columns, non-null counts, dtypes)
df.describe() # descriptive statistics per column (numeric columns by default)

In [None]:
# play around with read_csv parameters
# NOTE: every call re-reads the file and overwrites df, so keep only the variant you need
df = pd.read_csv(file_name, index_col=['id']) # use column 'id' as the index
df = pd.read_csv(file_name, header=1) # use the second row (0-based position 1) as header; rows before it are discarded
df = pd.read_csv(file_name, skiprows=1) # skip the first row of the file
# skipfooter is not supported by the default C parser, so request the python engine explicitly
df = pd.read_csv(file_name, skipfooter=1, engine='python') # skip the last row

In [None]:
# assign new column names (the list must have one entry per existing column)
# Only the columns attribute is assigned: a chained `df = df.columns = [...]`
# would first rebind df to the plain list and then fail on `.columns`.
df.columns = ['Column name 1', 'Column name 2']

In [1]:
# read in .shp file and keep the result in geo
# NOTE(review): presumably a GeoDataFrame; the relative path must exist from the working directory — confirm
geo = gpd.read_file('data/IMD/lab04_imd.shp')

In [None]:
# initial information shp
geo.plot() # quick visual check of the loaded geometries
geo.dtypes # data type of every column

***CLEAN DATA***

In [None]:
# divide dataset: two alternative ways to subset columns
# (running both in sequence leaves no columns, so pick one)
# keep only the listed columns
df = df.loc[:, ['Column 1', 'Column 2']]
# or remove the listed columns and keep everything else
df = df.drop(columns=['Column 1', 'Column 2'])

In [None]:
# delete rows by their index labels (e.g. rows holding unknown values)
df = df.drop(index=['Row 1', 'Row 2'])

In [None]:
# remember to reset the index; the previous index is kept as a regular column
df = df.reset_index(drop=False)

In [None]:
# sort values: first key descending, second key ascending
# (returns a sorted copy; df itself is not modified)
df.sort_values(
    ['column 1', 'column 2'],
    ascending=[False, True],
)

In [None]:
# aggregate data
# for each year in df, give the following attributes of rating: count, min, max
df.groupby(by='year').aggregate({'rating': ['count', 'min', 'max']})

In [None]:
# merge datasets

In [None]:
# set empty values to NaN / 0

***OPTIMISE R^2***

In [None]:
# iterate to find best r^2

***STATISTICAL ANALYSIS***

In [None]:
# perform linear regression

In [None]:
# moran's i

In [None]:
# moran's plot

***VISUALISE RESULTS***

In [None]:
# line diagram of one column; the returned plot object is kept in lineplot
# NOTE(review): the Series is passed positionally; which parameter receives it depends on the seaborn version — confirm
lineplot = sns.lineplot(df['Column name'])

In [None]:
# bar plot horizontal (orient = 'h'); the returned plot object is kept in barh
barh = sns.barplot(df['Column name'], orient = 'h')

In [None]:
# bar plot vertical (orient = 'v'); the returned plot object is kept in barv
barv = sns.barplot(df['Column name'], orient = 'v')

In [None]:
# scatter plot of one column; the returned plot object is kept in scatterplot
# NOTE(review): only a single Series is supplied, so the other axis is presumably its index — confirm
scatterplot = sns.scatterplot(df['Column name'])

In [None]:
# histogram, drawn twice with two different seaborn functions
# NOTE: histogram is rebound by the second call, so only the displot result stays referenced
histogram = sns.histplot(df['Column name'])
histogram = sns.displot(df['Column name'])
plt.xticks(rotation = -45, fontsize = 10) # rotate the x tick labels by -45 degrees, font size 10

In [None]:
# kernel density plot with the area under the curve filled (fill = True)
kde = sns.kdeplot(df['Column name'], fill = True)
plt.xticks(rotation = -45, fontsize = 10) # optional: rotate the x tick labels by -45 degrees

In [None]:
# box plot of one column
# The result is stored under its own name so that it does not overwrite
# the scatterplot handle created in the scatter plot cell.
boxplot = sns.boxplot(df['Column name'])

In [None]:
# violin plot of one column; the returned plot object is kept in violinplot
violinplot = sns.violinplot(df['Column name'])

***SAVE RESULTS***

In [None]:
# save geo as a comma-separated file, leaving out the index column
geo.to_csv(
    path_or_buf='foldername/filename.csv',
    sep=',',
    index=False,
)

In [None]:
# save as image

In [None]:
# save as pdf

***TIPS AND TRICKS***

In [None]:
# print information: build the message in one f-string; !s applies str() to each value
print(f'Cell value of "{column_name!s}" and "{row_name!s}" is: {value!s}')