In [None]:
# Dependencies
import numpy as np
import pandas as pd
import requests
import pymongo
import json
from tqdm.notebook import tqdm
from kaggle.api.kaggle_api_extended import KaggleApi
from sqlalchemy import create_engine
from census import Census

# Google API Key
from config import gkey

# Census API Key
from config import census_key
# Census client built from the key above with year=2017.
# NOTE(review): `c` is not referenced in any of the cells visible here — confirm it is still needed.
c = Census(census_key, year=2017)

# You will need to do the following for the Kaggle logic to work:
#   1. Run `pip install kaggle` from Git Bash or another command-line tool.
#   2. Create a Kaggle account and download the kaggle.json file containing your API key.
#   3. Move that file to:
#        Windows:     C:\Users\<Windows-username>\.kaggle\kaggle.json
#        Mac & Linux: ~/.kaggle/kaggle.json

In [None]:
# Kaggle download, step 1 of 3: create the Kaggle API client object.
# (Original author's note: this download path is a workaround for MongoDB limitations.)
api = KaggleApi()

In [None]:
# Kaggle download, step 2 of 3: authenticate the client created in the previous cell.
# presumably this reads the kaggle.json credentials described in the setup notes — verify
api.authenticate()

In [None]:
# Kaggle download, step 3 of 3: download the files of the "sobhanmoosavi/us-accidents" dataset.
# NOTE(review): no download path is passed here; the unzip cell below expects an archive
# named us-accidents.zip — confirm where the Kaggle API saves it on your machine.
api.dataset_download_files("sobhanmoosavi/us-accidents")

In [None]:
# Unzip the downloaded accidents archive. This removes the manual extraction step
# and the file-share size limitation.
import zipfile

# Paths are relative to the notebook's working directory rather than one user's
# absolute Windows path, so they agree with the "Resources/..." path the CSV is
# read from in the next step and work on any machine.
zip_folder = 'us-accidents.zip'   # You may need to adjust this path for your local environment
destination = 'Resources'         # You may need to adjust this path for your local environment
#pwd = '<YOUR_PASSWORD>'          # No password needed

# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(zip_folder) as zf:
    zf.extractall(destination)

In [None]:
# Read in the csv containing the accident data, convert the date strings to datetime,
# and parse the crash year & month out of the start time.
# Also, limit zip code to 5 characters for better data matches when merging with other sources.
Accidents_df = pd.read_csv("Resources/US_Accidents_Dec19.csv")

# Convert every timestamp column from its OWN values. The earlier version copied
# Start_Time into a misspelled 'Ent_Time' column and over 'Weather_Timestamp',
# which discarded the real weather timestamps and left End_Time as plain strings.
for time_col in ['Start_Time', 'End_Time', 'Weather_Timestamp']:
    Accidents_df[time_col] = pd.to_datetime(Accidents_df[time_col])

# Year / month of the accident start, used later as grouping keys.
Accidents_df['CrashYear'] = Accidents_df['Start_Time'].dt.year
Accidents_df['CrashMonth'] = Accidents_df['Start_Time'].dt.month

# Keep only the first 5 characters of the zip code (drops any longer suffix).
Accidents_df['Zip_code'] = Accidents_df['Zipcode'].str[:5]
Accidents_df.head()

In [None]:
# Swap the current index for a fresh 0..n-1 range (the old index values are dropped,
# not kept as a column), then preview the first rows.
Accidents_df = Accidents_df.reset_index(drop=True)
Accidents_df.head()

In [None]:
# Print the accident frame's column names, non-null counts, dtypes and memory usage.
Accidents_df.info()

In [None]:
# Bulk-load the accident rows into the US_Accidents.accidents collection in fixed-size batches.
#conn = "mongodb://localhost:27017" #Run this code if you want to load data to your local MongoDB
# NOTE(review): this URI embeds a username and password in the notebook. Move it into
# config.py (next to gkey / census_key) or an environment variable, and rotate the password.
conn = "mongodb+srv://Luderoch:1234@accidentcluster.zvsni.mongodb.net/Accidentsdb?retryWrites=true&w=majority" # Run this code for MongoDB Atlas (cloud)
grouper = 10000  # number of rows sent per insert_many call
#Drops collection to avoid duplicates in database
#db.accidents.drop()

# Open one client for the whole upload instead of reconnecting for every batch.
with pymongo.MongoClient(conn) as client:
    # Select database and collection to use
    db = client.US_Accidents
    accidents = db.accidents
    # Step through the frame in contiguous, non-overlapping slices. Stepping the range
    # by `grouper` means no row is skipped between batches (the previous
    # `idx * grouper + 1` start dropped row 10000) and no slice is ever empty
    # (insert_many rejects an empty document list).
    for start in tqdm(range(0, len(Accidents_df), grouper)):
        end = start + grouper
        # Columns from position 3 onward are uploaded; the round trip through
        # to_json turns the slice into plain JSON-compatible dicts.
        records = json.loads(Accidents_df.iloc[start:end,3:].to_json(orient = "table"))["data"]
        accidents.insert_many(records)

In [None]:
# Group the accident rows by crash year, crash month, state and 5-character zip code.
# The grouped object feeds the summary statistics that are later merged at
# State and/or Zip code level with Census and other data.
group_keys = ['CrashYear', 'CrashMonth', 'State', 'Zip_code']
groupT_Accidents = Accidents_df.groupby(group_keys)
# NOTE: on a GroupBy object, head(100) returns up to the first 100 rows of EACH group,
# not the first 100 rows overall.
groupT_Accidents.head(100)

In [None]:
#DO NOT RUN THIS CELL 
#Test_Accidents = Accidents_df.groupby(['CrashYear', 'CrashMonth','State','Zip_code']).agg(
#     {'Temperature(F)':['mean', 'min', 'max','median','std'],'Visibility(mi)':['mean', 'min', 'max','median','std']})                   
                                                                                                                   
# # ['mean', 'min', 'max','median','std']
# df.columns = ["_".join(x) for x in df.columns.ravel()] - Modify to simplify code in step below....change df to filename
# # grouped_Accidents = Accidents_df.groupby(['CrashYear', 'CrashMonth','State','Zip_code']).agg({'ID': ['count']})                                                                                            
# # grouped_Accidents.columns = ['Total_count']
# # grouped_Accidents = grouped_Accidents.reset_index()

# Test_Accidents.head(100)

In [None]:
# Data integrity check using field counts: number of accident IDs in each group.
Total_Counts = groupT_Accidents['ID'].count()


def _six_stats(grouped_column):
    """Return (mean, median, min, max, var, std) of one grouped column, in that order."""
    return (
        grouped_column.mean(),
        grouped_column.median(),
        grouped_column.min(),
        grouped_column.max(),
        grouped_column.var(),
        grouped_column.std(),
    )


# Per-group statistics for each weather measurement.
Mean_Temp, Med_Temp, Min_Temp, Max_Temp, Var_Temp, STD_Temp = _six_stats(
    groupT_Accidents['Temperature(F)'])
Mean_Vis, Med_Vis, Min_Vis, Max_Vis, Var_Vis, STD_Vis = _six_stats(
    groupT_Accidents['Visibility(mi)'])
Mean_Wind, Med_Wind, Min_Wind, Max_Wind, Var_Wind, STD_Wind = _six_stats(
    groupT_Accidents['Wind_Speed(mph)'])
Mean_Precip, Med_Precip, Min_Precip, Max_Precip, Var_Precip, STD_Precip = _six_stats(
    groupT_Accidents['Precipitation(in)'])

# (output column label, per-group series) pairs, in final column order.
labelled_stats = [
    ("Avg Temp(F)", Mean_Temp),
    ("Median Temp(F)", Med_Temp),
    ("Min Temp(F)", Min_Temp),
    ("Max Temp(F)", Max_Temp),
    ("Temp Var", Var_Temp),
    ("Temp Std", STD_Temp),
    ("Avg Visibility(mi)", Mean_Vis),
    ("Median Visibility(mi)", Med_Vis),
    ("Min Visibility(mi)", Min_Vis),
    ("Max Visibility(mi)", Max_Vis),
    ("Visibility(mi) Var", Var_Vis),
    ("Visibility(mi) Std", STD_Vis),
    ("Avg Windspeed(mph)", Mean_Wind),
    ("Median Windspeed(mph)", Med_Wind),
    ("Min Windspeed(mph)", Min_Wind),
    ("Max Windspeed(mph)", Max_Wind),
    ("Windspeed(mph) Var", Var_Wind),
    ("Windspeed(mph) Std", STD_Wind),
    ("Avg Precip(in)", Mean_Precip),
    # NOTE(review): the label below has a stray closing parenthesis; it is kept as-is
    # because it becomes a stored field name that other code may already rely on.
    ("Median Precip(in))", Med_Precip),
    ("Min Precip(in)", Min_Precip),
    ("Max Precip(in)", Max_Precip),
    ("Precip(in) Var", Var_Precip),
    ("Precip(in) Std", STD_Precip),
    ("Total Counts", Total_Counts),
]

# Assemble the summary table, then turn the group keys from the index into ordinary columns.
SummaryStats_df = pd.DataFrame(dict(labelled_stats))
SummaryStats_df = SummaryStats_df.reset_index()

SummaryStats_df.head(20)

In [None]:
# Summary statistics metadata: column names, non-null counts, dtypes and memory usage.
SummaryStats_df.info()

In [None]:
# Bulk-load the per-group summary statistics into the US_Accidents.CrashSummary collection.
#conn = "mongodb://localhost:27017" #Run this code if you want to load data to your local MongoDB
# NOTE(review): this URI embeds a username and password in the notebook. Move it into
# config.py (next to gkey / census_key) or an environment variable, and rotate the password.
conn = "mongodb+srv://Luderoch:1234@accidentcluster.zvsni.mongodb.net/Accidentsdb?retryWrites=true&w=majority" # Run this code for MongoDB Atlas (cloud)
grouper = 10000  # number of rows sent per insert_many call

# Open one client for the drop and every insert. The drop used to run through a `db`
# handle left over from an earlier cell, whose client had already been closed.
with pymongo.MongoClient(conn) as client:
    # Select database and collection to use
    db = client.US_Accidents
    CrashSummary = db.CrashSummary
    #Drops collection to avoid duplicates in database
    CrashSummary.drop()

    # Step through the frame in contiguous, non-overlapping slices. Stepping the range
    # by `grouper` means no row is skipped between batches (the previous
    # `idx * grouper + 1` start dropped row 10000) and no slice is ever empty
    # (insert_many rejects an empty document list).
    for start in tqdm(range(0, len(SummaryStats_df), grouper)):
        end = start + grouper
        # The round trip through to_json turns the slice into plain JSON-compatible dicts.
        records_Summary = json.loads(SummaryStats_df.iloc[start:end,:].to_json(orient = "table"))["data"]
        CrashSummary.insert_many(records_Summary)

In [None]:
# records_Summary = json.loads(SummaryStats_df.to_json(orient = "table"))["data"]

In [None]:
# # Setup connection to mongodb
# conn = "mongodb://localhost:27017" #Run this code if you want to load data to your local MongoDB
# #conn = "mongodb+srv://Luderoch:1234@accidentcluster.zvsni.mongodb.net/Accidentsdb?retryWrites=true&w=majority" # Run this code for MangoDB Atlas (cloud)
# client = pymongo.MongoClient(conn)

# # Select database and collection to use
# db = client.US_Accidents

# #Drops collection to avoid duplicates in database
# db.CrashSummary.drop()

# CrashSummary = db.CrashSummary

# CrashSummary.insert_many(records_Summary)