# Setup and Data mining

In [None]:
# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option("display.max_columns", None)

In [None]:
# Global seaborn theme: white background, pastel palette, enlarged fonts,
# thicker plot lines and a default figure size of 11.7 x 8.27 inches.
sns.set(
    style="white",
    palette="pastel",
    font_scale=1.5,
    rc={
        "figure.figsize": (11.7, 8.27),
        "lines.linewidth": 2.5,
    },
)

In [None]:
# Collect the paths of all CSV files located directly inside `path`.
path = r"file to csvs"  # replace with the directory that holds your CSVs
all_files = glob.glob(f"{path}/*.csv")

# Data Processing and Sorting

In [None]:
# Preview the first non-null entries of a column.
df.loc[df["column"].notna(), "column"].head()

In [None]:
# Relative frequency of every "target" value within each "grouping" group.
# The counts are normalized per group and returned as a flat frame with a
# "prob" column - useful when a plot should show percentages instead of the
# absolute numbers a countplot would give.
(
    df.groupby("grouping")["target"]
    .value_counts(normalize=True)
    .rename("prob")
    .reset_index()
)

In [42]:
# Remove outliers: drop every row in which at least one column has an
# absolute z-score of 3 or more.
# NOTE(review): assumes df1 contains only numeric columns - confirm.

from scipy import stats

# nan_policy="omit" leaves missing values out of the mean/std computation, so
# a single NaN no longer turns the z-scores of its entire column into NaN.
z = np.abs(stats.zscore(df1, nan_policy="omit"))

# Select the rows to discard (some z >= 3) rather than the rows to keep
# (all z < 3): every comparison with NaN is False, so a "keep if all z < 3"
# test would also discard rows whose z-score is NaN (missing value, or a
# constant column where z = 0/0) even though they are not outliers.
is_outlier = (z >= 3).any(axis=1)
df1 = df1[~is_outlier]

# Plotting

In [None]:
# Put the legend outside the axes: its upper-left corner is anchored just
# beyond the right edge of the axes (x=1.05, y=1 in axes coordinates).
plt.legend(
    loc="upper left",
    bbox_to_anchor=(1.05, 1),
    borderaxespad=0.0,
    fancybox=True,
)

In [None]:
# Hide a single axes, e.g. an empty subplot in a grid. set_visible(False)
# can be used the same way on other artists such as legends.
unused_ax = ax[0][0]
unused_ax.set_visible(False)

## Plotly Maps

In [None]:
# Plotly scatter map on OpenStreetMap tiles: one marker per row of `data`,
# colored by "column3" and sized by "column4".

fig = px.scatter_mapbox(
    data,
    # marker position
    lat="lat",
    lon="long",
    # marker appearance
    color="column3",
    color_continuous_scale=px.colors.sequential.Jet,
    size="column4",
    size_max=10,
    opacity=1,
    # hover tooltip
    hover_name="name",
    hover_data=["column1", "column2"],
    # map view
    mapbox_style="open-street-map",
    center={"lat": 51.1, "lon": 10.3},
    zoom=1,
    height=500,
)
# Zero margins on all four sides so the map fills the whole figure.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [None]:
# Hide the legend and place one text annotation on the Plotly figure.

fig.update_layout(
    annotations=[
        {
            "x": 0.13,
            "y": 0,
            "text": "annotation string",
            "font": {"color": "black", "size": 75},
        }
    ],
    showlegend=False,
)

In [None]:
# Export the Plotly figure to a PNG whose file name contains `year`.
fig.write_image(f"figs/map{year!s}.png", width=1000, height=700, scale=3)

In [43]:
# Correlation matrix heatmap ("Dirk style"): correlations of df_model shown
# in percent, with the upper triangle (including the diagonal) hidden.

plt.figure(figsize=(12, 12))

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = df_model.corr()

# Boolean mask that is True on the upper triangle and the diagonal. It is
# built from the matrix shape, not from the correlation values, so a
# correlation of exactly 0 in the upper triangle is hidden as well.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

ax = sns.heatmap(
    (corr_matrix * 100).round(),
    annot=True,
    mask=mask,
    cmap="coolwarm",
    center=0,
)

# Span the axes over the full matrix, whatever its size (the limits used to
# be fixed at 10, which only fits a matrix of exactly ten features).
# NOTE(review): setting the limits explicitly is presumably a workaround for
# heatmap rows being cut off in some matplotlib versions - confirm whether it
# is still needed.
n_features = corr_matrix.shape[0]
ax.set_ylim((0, n_features))
ax.set_xlim((0, n_features));

In [None]:
# Pretty confusion matrix: heatmap of the row-normalized confusion matrix of
# `model` evaluated on (X_test, y_test).

cm = confusion_matrix(y_test, model.predict(X_test))
# Normalize every row by its sum, so each cell is the fraction of that true
# class which received the given prediction.
cm = cm.astype("float") / cm.sum(axis=1, keepdims=True)

# View with a heatmap.
# NOTE(review): `i` is not defined in this cell - presumably a figure number
# coming from an enclosing loop; confirm against the caller.
plt.figure(i)
sns.heatmap(
    cm,
    annot=True,
    annot_kws={"size": 30},
    cmap="Blues",
    square=True,
    fmt=".3f",
)
plt.ylabel("True label")
plt.xlabel("Predicted label")
# Take the y-limit from the number of classes instead of a fixed 2, so the
# plot also shows every row for problems with more than two classes.
plt.ylim(0, cm.shape[0])
plt.title("Confusion matrix for:\n{}".format(model.__class__.__name__));

# Pipelines

In [None]:
# Preprocessing pipeline ("Dirk style"): numeric and object columns are
# handled by separate sub-pipelines that are combined in a ColumnTransformer.

# Numeric columns: impute missing values with the median, then standardize.
imputer1 = SimpleImputer(strategy="median")
scaler1 = StandardScaler()
numeric_transformer = make_pipeline(imputer1, scaler1)

# Object columns: fill missing values with the constant "missing", then
# one-hot encode while dropping the first category of each feature.
# handle_unknown="error" makes the encoder raise on categories it has not
# seen during fitting.
imputer2 = SimpleImputer(strategy="constant", fill_value="missing")
encoder = OneHotEncoder(drop="first", handle_unknown="error")
object_transformer = make_pipeline(imputer2, encoder)

# Route each list of column names to its sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", object_transformer, object_features),
    ]
)

In [None]:
# Regression pipeline with model-based feature selection:
#   1. "preprocessor"      - the column-wise preprocessing defined elsewhere
#   2. "poly"              - PolynomialFeatures of degree 1
#   3. "feature_selection" - keep the features whose random-forest importance
#                            reaches the mean importance (threshold="mean")
#   4. "regressor"         - Lasso with 5-fold cross-validated regularization

reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("poly", PolynomialFeatures(degree=1)),
        (
            "feature_selection",
            SelectFromModel(
                estimator=RandomForestRegressor(
                    n_estimators=1000,
                    max_depth=2,
                    random_state=0,
                ),
                threshold="mean",
            ),
        ),
        ("regressor", LassoCV(cv=5, random_state=0)),
    ]
)