In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the four tab-separated files (no header row) into frames with
# columns "x" and "y", keyed "data1" .. "data4".
data = {
    f"data{i}": pd.read_csv(
        f"../files/data{i}.tsv",
        sep="\t",
        header=None,
        index_col=False,
        names=["x", "y"],
    )
    for i in range(1, 5)
}

## Calculating mean

In [None]:
# Print the mean of the x and y columns, one dataset at a time.
for name, frame in data.items():
    mean_x, mean_y = np.mean(frame.x), np.mean(frame.y)
    print(name)
    print(f"x={mean_x:.2f}, y={mean_y:.2f}")

## Calculating variance and correlation

In [None]:
# Print the variance of the x and y columns (np.var called with its defaults),
# one dataset at a time.
for name, frame in data.items():
    var_x = np.var(frame.x)
    var_y = np.var(frame.y)
    print(name)
    print(f"x={var_x:.3f}, y={var_y:.3f}")

In [None]:
# Report the Pearson correlation coefficient between x and y for each dataset.
for k, d in data.items():
    # np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry
    # is the coefficient between the two inputs.
    pearson_r = np.corrcoef(d.x, d.y)[0, 1]
    print(k)
    print(f"Pearson correlation = {pearson_r:.3f}")

## Linear Regression

In [None]:
from scipy import stats

In [None]:
# Fit a least-squares line per dataset and print its two coefficients
# rounded to two decimals.
for name, frame in data.items():
    slope, intercept, *_ = stats.linregress(frame.x, frame.y)
    print(name)
    print(f"slope={slope:.2f}, intercept={intercept:.2f}")

In [None]:
# 2x2 grid: one panel per dataset with the scatter of its points and the
# fitted regression line drawn across the observed x range.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
panels = axs.flatten()

cols = ["b", "r", "g", "y"]
for (name, frame), ax, col in zip(data.items(), panels, cols):
    ax.scatter(frame.x, frame.y, color=col)

    slope, intercept, *_ = stats.linregress(frame.x, frame.y)
    line_x = np.linspace(np.min(frame.x), np.max(frame.x))
    line_y = slope * line_x + intercept
    ax.plot(line_x, line_y, "black")
    ax.set_title(name)

# Label every panel, including any that received no dataset.
for ax in panels:
    ax.set_xlabel("x")
    ax.set_ylabel("y")
plt.tight_layout()

## Part 1.2
*Exercise 1.2:* Questions for the lecture
* What is the difference between *data* and *metadata*? How does that relate to the GPS tracks-example?
    * Data is concrete, metadata is data about the data
* Sune says that the human eye is a great tool for data analysis. Do you agree? Explain why/why not. Mention something that the human eye is very good at. Can you think of something that [is difficult for the human eye](http://cdn.ebaumsworld.com/mediaFiles/picture/718392/84732652.jpg)? Explain why your example is difficult.
    * I agree, it's really good. It is good at spotting patterns, but it can be confused.
* Simpson's paradox is hard to explain. Come up with your own example - or find one online.
    * The school system in America
* In your own words, explain the difference between *exploratory* and *explanatory* data analysis. 
    * Explore the data or explain the data

In [None]:
# Load the incident-report CSV export into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../data/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv"
)
# Crime categories that the per-category plots below are restricted to.
focuscrimes = {
    'WEAPON LAWS',
    'PROSTITUTION',
    'DRIVING UNDER THE INFLUENCE',
    'ROBBERY',
    'BURGLARY',
    'ASSAULT',
    'DRUNKENNESS',
    'DRUG/NARCOTIC',
    'TRESPASS',
    'LARCENY/THEFT',
    'VANDALISM',
    'VEHICLE THEFT',
    'STOLEN PROPERTY',
    'DISORDERLY CONDUCT',
}

In [None]:
# Derive calendar features from the "Date" and "Time" columns.
# "Date" is parsed once and reused; re-parsing the full column for every
# derived feature repeats the most expensive step three times.
incident_dates = pd.to_datetime(df.Date)
df["DayName"] = incident_dates.dt.day_name()
df["WeekIdx"] = incident_dates.dt.weekday  # pandas convention: Monday=0 ... Sunday=6
df["Month"] = incident_dates.dt.month
df["Hour"] = pd.to_datetime(df.Time).dt.hour

In [None]:
# One bar chart per focus-crime category: incident counts per day of the week.
fig, axs = plt.subplots(7, 2, figsize=(10, 20))
focus_by_category = df[df.Category.isin(focuscrimes)].groupby("Category")
for (category, incidents), ax in zip(focus_by_category, axs.flatten()):
    # WeekIdx leads the index so sort_index orders the bars by weekday number
    # rather than alphabetically by day name.
    per_day = incidents[["WeekIdx", "DayName"]].value_counts().sort_index()
    per_day.plot.bar(ax=ax, title=category)

plt.tight_layout()

In [None]:
# One bar chart per focus-crime category: incident counts per calendar month.
fig, axs = plt.subplots(7, 2, figsize=(10, 20))
focus_by_category = df[df.Category.isin(focuscrimes)].groupby("Category")
for (category, incidents), ax in zip(focus_by_category, axs.flatten()):
    # Count on the Series (not a one-column DataFrame) so the tick labels are
    # plain month numbers, and reindex so every panel shows all twelve months
    # even if a category has none recorded in some month.
    monthly_counts = (
        incidents["Month"]
        .value_counts()
        .reindex(range(1, 13), fill_value=0)
    )
    monthly_counts.plot.bar(ax=ax, title=category)
    ax.set_xlabel("Month")
    ax.set_ylabel("Incidents")

plt.tight_layout()

In [None]:
# One bar chart per focus-crime category: incident counts per hour of the day.
fig, axs = plt.subplots(7, 2, figsize=(10, 20))
focus_by_category = df[df.Category.isin(focuscrimes)].groupby("Category")
for (category, incidents), ax in zip(focus_by_category, axs.flatten()):
    # Count on the Series (not a one-column DataFrame) so the tick labels are
    # plain hour numbers, and reindex so every panel shows all 24 hours even
    # if a category has none recorded in some hour.
    hourly_counts = (
        incidents["Hour"]
        .value_counts()
        .reindex(range(24), fill_value=0)
    )
    hourly_counts.plot.bar(ax=ax, title=category)
    ax.set_xlabel("Hour")
    ax.set_ylabel("Incidents")

plt.tight_layout()

In [None]:
# One bar chart per focus-crime category: incident counts for every
# (weekday, hour) slot of the week, ordered by WeekIdx then Hour.
fig, axs = plt.subplots(7, 2, figsize=(10, 20))

# The ticks below are positional (bar number 0, 12, 24, ...), so every panel
# must contain exactly 7 * 24 bars. Reindexing onto the full grid inserts
# zero-height bars for slots a category never hits; without that, a single
# missing slot would shift every later bar out from under its tick label.
week_hours = pd.MultiIndex.from_product(
    [range(7), range(24)], names=["WeekIdx", "Hour"]
)
tick_positions = list(range(0, 169, 12))
tick_labels = [f"{pos % 24}" for pos in tick_positions]

focus_by_category = df[df.Category.isin(focuscrimes)].groupby("Category")
for (category, incidents), ax in zip(focus_by_category, axs.flatten()):
    slot_counts = (
        incidents[["WeekIdx", "Hour"]]
        .value_counts()
        .reindex(week_hours, fill_value=0)
    )
    slot_counts.plot.bar(ax=ax, title=category)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=0)
    ax.set_xlabel("Hour of day, across the week")
    ax.set_ylabel("Incidents")

plt.tight_layout()

In [None]:
# Show the distinct values found in the PdDistrict column.
district_names = df["PdDistrict"].unique()
print("10 Districts")
print(district_names)

In [None]:
# Incident totals per police district, largest first: all categories,
# then the focus-crime categories only.
all_per_district = df.groupby("PdDistrict").size()
focus_per_district = df[df.Category.isin(focuscrimes)].groupby("PdDistrict").size()

print("Number of crimes in each district")
print(all_per_district.sort_values(ascending=False))

print("Number of focus crimes")
print(focus_per_district.sort_values(ascending=False))

In [None]:
# Compare each district's crime mix with the overall mix:
# ratio = P(category | district) / P(category), plotted for the focus crimes.
p_crime = df.Category.value_counts(normalize=True)
p_crime_districts = df.groupby("PdDistrict").Category.value_counts(normalize=True)

fig, axs = plt.subplots(5, 2, figsize=(10, 30))

for (area, district_share), ax in zip(p_crime_districts.groupby(level=0), axs.flatten()):
    # Drop the district level so the index is the category alone and the
    # division aligns with p_crime label by label.
    by_category = district_share.droplevel(0)
    ratio = by_category / p_crime
    focus_ratio = ratio[ratio.index.isin(focuscrimes)]
    focus_ratio.plot.bar(title=area, ax=ax)

plt.tight_layout()

In [None]:
# Placeholder value per police district, used to try out the choropleth.
randomdata = {
    'CENTRAL': 0.8903601342256143,
    'SOUTHERN': 0.8642882941363439,
    'BAYVIEW': 0.925634097746596,
    'MISSION': 0.7369022697287458,
    'PARK': 0.9864113307070926,
    'RICHMOND': 0.5422239624697017,
    'INGLESIDE': 0.5754056712571605,
    'TARAVAL': 0.5834730737348696,
    'NORTHERN': 0.08148199528212985,
    'TENDERLOIN': 0.37014287986350447,
}

# Two-column frame: district name and its value.
df_random = pd.Series(randomdata, name="DATA").rename_axis("DISTRICT").reset_index()

# Rescale so the largest value becomes 12.
df_random["DATA"] = (df_random["DATA"] / df_random["DATA"].max()) * 12
df_random

In [None]:
import json
import plotly.express as px

In [None]:
# Draw the regions from ../files/sfpd.geojson, coloured by the placeholder
# values in df_random (matched on the "DISTRICT" column).
# The context manager closes the file handle as soon as the GeoJSON is parsed.
with open("../files/sfpd.geojson", "r") as geojson_file:
    counties = json.load(geojson_file)

fig = px.choropleth_mapbox(
    df_random,
    geojson=counties,
    locations="DISTRICT",
    color="DATA",
    color_continuous_scale="Viridis",
    range_color=(0, 12),  # df_random.DATA is scaled so its maximum is 12
    mapbox_style="carto-positron",
    zoom=10,
    center={"lat": 37.773972, "lon": -122.431297},
    opacity=0.5,
)
fig.show()

In [None]:
# Share of "VEHICLE THEFT" incidents per district among rows with WeekIdx == 1,
# rescaled so the district with the largest share gets the value 12.
# NOTE: this rebinds `data`, which earlier in the notebook held the dict of TSV frames.
is_vehicle_theft = df.Category == "VEHICLE THEFT"
is_weekidx_one = df.WeekIdx == 1
district_share = df[is_vehicle_theft & is_weekidx_one].PdDistrict.value_counts(normalize=True)

data = district_share.rename_axis("DISTRICT").reset_index(name="DATA")
data["DATA"] = (data["DATA"] / data["DATA"].max()) * 12

In [None]:
# Draw the regions from ../files/sfpd.geojson, coloured by the per-district
# values in `data` (matched on the "DISTRICT" column).
# The context manager closes the file handle as soon as the GeoJSON is parsed.
with open("../files/sfpd.geojson", "r") as geojson_file:
    counties = json.load(geojson_file)

fig = px.choropleth_mapbox(
    data,
    geojson=counties,
    locations="DISTRICT",
    color="DATA",
    color_continuous_scale="Viridis",
    range_color=(0, 12),  # data.DATA is scaled so its maximum is 12
    mapbox_style="carto-positron",
    zoom=10,
    center={"lat": 37.773972, "lon": -122.431297},
    opacity=0.5,
)
fig.show()