This notebook connects to AWS Postgres Crime Database which was created in the AWS Postgres terminal script, then it creates empty tables corresponding to the datasets that will be used to analyize the crime dataset of Los Angeles. the datasets include_LA crime data (taking the years 2012-2017), LA demographics data, LA weather data (temperature, humidity, pressure, wind) and LA median house listing prices.

Once the tables are created in the AWS Postgres database, then we use 

=# COPY < table_name > FROM <'csv file location in aws'> DELIMITER ',' CSV HEADER;''

command to copy the rest of the data into the empty tables
    

In [1]:
import datetime
import ast

In [20]:
# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql

# We are also going to do some basic viz
import matplotlib.pyplot as plt
%matplotlib inline 



connection_args = {
    'host': '54.185.23.30',  # We are connecting to our _local_ version of psql
    'user': 'ubuntu',
    'dbname': 'crime',    # DB that we are connecting to
    'port': 5432          # port we opened on AWS
}

# We will talk about this magic Python trick!
connection = pg.connect(**connection_args)

In [3]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://ubuntu:ubuntu@54.185.23.30:5432/crime')

In [37]:
#arrest_2010_present_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/los-angeles-crime-arrest-data/arrest-data-from-2010-to-present.csv')
#arrest_2010_present_df.iloc[:0].to_sql('arrest_2010_present', con = engine)


Prepare the crime data a bit before pushing to aws database

In [4]:
crime_2010_present_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/los-angeles-crime-arrest-data/crime-data-from-2010-to-present.csv')


In [5]:
# strip the columns of extra space
crime_2010_present_df.columns =crime_2010_present_df.columns.str.strip()


In [6]:
crime_2010_present_df.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Weapon Description,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location
0,102005556,2010-01-25T00:00:00,2010-01-22T00:00:00,2300,20,Olympic,2071,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,VAN NESS,15TH,"{'latitude': '34.0454', 'needs_recoding': Fals..."
1,101822289,2010-11-11T00:00:00,2010-11-10T00:00:00,1800,18,Southeast,1803,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,88TH,WALL,"{'latitude': '33.9572', 'needs_recoding': Fals..."
2,101105609,2010-01-28T00:00:00,2010-01-27T00:00:00,2230,11,Northeast,1125,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,YORK,AVENUE 51,"{'latitude': '34.1211', 'needs_recoding': Fals..."
3,101620051,2010-11-11T00:00:00,2010-11-07T00:00:00,1600,16,Foothill,1641,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,EL DORADO,TRUESDALE,"{'latitude': '34.241', 'needs_recoding': False..."
4,101910498,2010-04-07T00:00:00,2010-04-07T00:00:00,1600,19,Mission,1902,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,GLENOAKS,DRELL,"{'latitude': '34.3147', 'needs_recoding': Fals..."


In [7]:
# change the dates to datetime objects 
crime_2010_present_df['Date Occurred'] = pd.to_datetime(crime_2010_present_df['Date Occurred'])


In [8]:
# limit the years between 2012 and 2017
smaller_crime_2010_present_df = crime_2010_present_df[(pd.DatetimeIndex(crime_2010_present_df['Date Occurred']).year >=2012)]
smaller_crime_2010_present_df = smaller_crime_2010_present_df[pd.DatetimeIndex(smaller_crime_2010_present_df['Date Occurred']).year <=2017]


In [9]:
# change the dates to datetime objects 
smaller_crime_2010_present_df['Date Reported'] = pd.to_datetime(smaller_crime_2010_present_df['Date Reported'])


In [10]:
smaller_crime_2010_present_df.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Weapon Description,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location
409260,605,2012-06-07,2012-06-06,2300,5,Harbor,518,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,24600 AVALON BL,,"{'latitude': '33.8028', 'needs_recoding': Fals..."
409261,110322412,2012-10-06,2012-09-18,1200,3,Southwest,328,662,"BUNCO, GRAND THEFT",0701,...,,AA,Adult Arrest,662.0,,,,700 W 27TH ST,,"{'latitude': '34.0278', 'needs_recoding': Fals..."
409262,110923275,2012-01-06,2012-01-06,1530,15,N Hollywood,1512,510,VEHICLE - STOLEN,,...,,AA,Adult Arrest,510.0,,,,ALCOVE AV,VOSE ST,"{'latitude': '34.1985', 'needs_recoding': Fals..."
409263,112023635,2012-01-17,2012-01-17,2140,11,Northeast,1143,510,VEHICLE - STOLEN,1402 1309 0916 0342,...,,AA,Adult Arrest,510.0,998.0,,,3800 TRACY ST,,"{'latitude': '34.1059', 'needs_recoding': Fals..."
409264,120100001,2012-03-13,2012-03-12,2000,1,Central,101,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,1200 W SUNSET BL,,"{'latitude': '34.0682', 'needs_recoding': Fals..."


### Clean the Location column in pandas 

Think how to do in sql

In [11]:
smaller_crime_2010_present_df['Location'] = smaller_crime_2010_present_df['Location'].apply(ast.literal_eval)

In [12]:
# functions to access the lat and long information in the 
def get_latitude(location):
    return location['latitude']

def get_longitude(location):
    return location['longitude']
    

In [13]:
# accessing the lat and long 
smaller_crime_2010_present_df['Longitude'] = smaller_crime_2010_present_df.Location.apply(get_longitude).astype(float)
smaller_crime_2010_present_df['Latitude'] = smaller_crime_2010_present_df.Location.apply(get_latitude).astype(float)


Change the Time column to whole hour, we will change this to timestamp later

In [14]:
smaller_crime_2010_present_df['Time Occurred'] = smaller_crime_2010_present_df['Time Occurred'].astype(str).str.zfill(4)


In [15]:
smaller_crime_2010_present_df['Hour_Occurred'] = pd.to_datetime(smaller_crime_2010_present_df['Time Occurred'],format='%H%M').dt.hour


In [16]:
smaller_crime_2010_present_df['Time_Occurred_with_hour'] = smaller_crime_2010_present_df['Date Occurred']+pd.to_timedelta(smaller_crime_2010_present_df.Hour_Occurred, unit='h')


In [17]:
smaller_crime_2010_present_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1257800 entries, 409260 to 1667059
Data columns (total 30 columns):
DR Number                  1257800 non-null int64
Date Reported              1257800 non-null datetime64[ns]
Date Occurred              1257800 non-null datetime64[ns]
Time Occurred              1257800 non-null object
Area ID                    1257800 non-null int64
Area Name                  1257800 non-null object
Reporting District         1257800 non-null int64
Crime Code                 1257800 non-null int64
Crime Code Description     1257800 non-null object
MO Codes                   1121494 non-null object
Victim Age                 1257800 non-null int64
Victim Sex                 1140726 non-null object
Victim Descent             1140697 non-null object
Premise Code               1257775 non-null float64
Premise Description        1257772 non-null object
Weapon Used Code           413826 non-null float64
Weapon Description         413826 non-null object
Stat

In [18]:
smaller_crime_2010_present_df.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,Longitude,Latitude,Hour_Occurred,Time_Occurred_with_hour
409260,605,2012-06-07,2012-06-06,2300,5,Harbor,518,510,VEHICLE - STOLEN,,...,,,,24600 AVALON BL,,"{'latitude': '33.8028', 'needs_recoding': Fals...",-118.264,33.8028,23,2012-06-06 23:00:00
409261,110322412,2012-10-06,2012-09-18,1200,3,Southwest,328,662,"BUNCO, GRAND THEFT",0701,...,,,,700 W 27TH ST,,"{'latitude': '34.0278', 'needs_recoding': Fals...",-118.2783,34.0278,12,2012-09-18 12:00:00
409262,110923275,2012-01-06,2012-01-06,1530,15,N Hollywood,1512,510,VEHICLE - STOLEN,,...,,,,ALCOVE AV,VOSE ST,"{'latitude': '34.1985', 'needs_recoding': Fals...",-118.4126,34.1985,15,2012-01-06 15:00:00
409263,112023635,2012-01-17,2012-01-17,2140,11,Northeast,1143,510,VEHICLE - STOLEN,1402 1309 0916 0342,...,998.0,,,3800 TRACY ST,,"{'latitude': '34.1059', 'needs_recoding': Fals...",-118.2755,34.1059,21,2012-01-17 21:00:00
409264,120100001,2012-03-13,2012-03-12,2000,1,Central,101,510,VEHICLE - STOLEN,,...,,,,1200 W SUNSET BL,,"{'latitude': '34.0682', 'needs_recoding': Fals...",-118.2502,34.0682,20,2012-03-12 20:00:00


### save the new smaller table

In [19]:
smaller_crime_2010_present_df.iloc[:0].to_sql('crime_2012_2017', con = engine, index = False)


In [20]:
smaller_crime_2010_present_df.to_csv('/Users/aminenhila/Desktop/Metis/Project3/Crime_data_2012_2017.csv', index=False ,sep = ',')


Upload the rest of the datasets to aws

### City attributes (Lat and Long, LA is one of the cities)

In [21]:
city_attributes_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/historical-hourly-weather-data/city_attributes.csv')


In [23]:
city_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 4 columns):
City         36 non-null object
Country      36 non-null object
Latitude     36 non-null float64
Longitude    36 non-null float64
dtypes: float64(2), object(2)
memory usage: 1.2+ KB


In [24]:
city_attributes_df.head()

Unnamed: 0,City,Country,Latitude,Longitude
0,Vancouver,Canada,49.24966,-123.119339
1,Portland,United States,45.523449,-122.676208
2,San Francisco,United States,37.774929,-122.419418
3,Seattle,United States,47.606209,-122.332069
4,Los Angeles,United States,34.052231,-118.243683


In [53]:
city_attributes_df.iloc[:0].to_sql('city_attributes', con = engine, index = False)


### Humidity for different cities including LA

In [4]:
humiduty_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/historical-hourly-weather-data/humidity.csv')


In [5]:
humiduty_df.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,25.0,,,
1,2012-10-01 13:00:00,76.0,81.0,88.0,81.0,88.0,82.0,22.0,23.0,50.0,...,71.0,58.0,93.0,68.0,50.0,63.0,22.0,51.0,51.0,50.0
2,2012-10-01 14:00:00,76.0,80.0,87.0,80.0,88.0,81.0,21.0,23.0,49.0,...,70.0,57.0,91.0,68.0,51.0,62.0,22.0,51.0,51.0,50.0
3,2012-10-01 15:00:00,76.0,80.0,86.0,80.0,88.0,81.0,21.0,23.0,49.0,...,70.0,57.0,87.0,68.0,51.0,62.0,22.0,51.0,51.0,50.0
4,2012-10-01 16:00:00,77.0,80.0,85.0,79.0,88.0,81.0,21.0,23.0,49.0,...,69.0,57.0,84.0,68.0,52.0,62.0,22.0,51.0,51.0,50.0


In [6]:
humiduty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null object
Vancouver            43427 non-null float64
Portland             44804 non-null float64
San Francisco        44311 non-null float64
Seattle              44964 non-null float64
Los Angeles          45101 non-null float64
San Diego            44909 non-null float64
Las Vegas            44411 non-null float64
Phoenix              43945 non-null float64
Albuquerque          44543 non-null float64
Denver               43445 non-null float64
San Antonio          44689 non-null float64
Dallas               44934 non-null float64
Houston              45132 non-null float64
Kansas City          44741 non-null float64
Minneapolis          44743 non-null float64
Saint Louis          43964 non-null float64
Chicago              44144 non-null float64
Nashville            44686 non-null float64
Indianapolis         44558 non-null float64
Atlanta     

In [8]:
humiduty_df['datetime'] = pd.to_datetime(humiduty_df.datetime)

In [9]:
humiduty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null datetime64[ns]
Vancouver            43427 non-null float64
Portland             44804 non-null float64
San Francisco        44311 non-null float64
Seattle              44964 non-null float64
Los Angeles          45101 non-null float64
San Diego            44909 non-null float64
Las Vegas            44411 non-null float64
Phoenix              43945 non-null float64
Albuquerque          44543 non-null float64
Denver               43445 non-null float64
San Antonio          44689 non-null float64
Dallas               44934 non-null float64
Houston              45132 non-null float64
Kansas City          44741 non-null float64
Minneapolis          44743 non-null float64
Saint Louis          43964 non-null float64
Chicago              44144 non-null float64
Nashville            44686 non-null float64
Indianapolis         44558 non-null float64
Atla

In [11]:
humiduty_df.iloc[:0].to_sql('humidity', con = engine, index = False)

In [12]:
humiduty_df.to_csv('/Users/aminenhila/Desktop/Metis/Project3/humidity.csv', index=False ,sep = ',')

### Pressure for different cities including LA

In [10]:
pressure_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/historical-hourly-weather-data/pressure.csv')


In [30]:
pressure_df.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,1011.0,,,
1,2012-10-01 13:00:00,,1024.0,1009.0,1027.0,1013.0,1013.0,1018.0,1013.0,1024.0,...,1014.0,1012.0,1001.0,1014.0,984.0,1012.0,1010.0,1013.0,1013.0,990.0
2,2012-10-01 14:00:00,,1024.0,1009.0,1027.0,1013.0,1013.0,1018.0,1013.0,1024.0,...,1014.0,1012.0,986.0,1014.0,984.0,1012.0,1010.0,1013.0,1013.0,990.0
3,2012-10-01 15:00:00,,1024.0,1009.0,1028.0,1013.0,1013.0,1018.0,1013.0,1024.0,...,1014.0,1012.0,945.0,1014.0,984.0,1012.0,1010.0,1013.0,1013.0,990.0
4,2012-10-01 16:00:00,,1024.0,1009.0,1028.0,1013.0,1013.0,1018.0,1013.0,1024.0,...,1014.0,1012.0,904.0,1014.0,984.0,1012.0,1010.0,1013.0,1013.0,990.0


In [31]:
pressure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null object
Vancouver            41019 non-null float64
Portland             45249 non-null float64
San Francisco        44438 non-null float64
Seattle              45240 non-null float64
Los Angeles          45001 non-null float64
San Diego            45078 non-null float64
Las Vegas            45165 non-null float64
Phoenix              44659 non-null float64
Albuquerque          44797 non-null float64
Denver               44710 non-null float64
San Antonio          45236 non-null float64
Dallas               45193 non-null float64
Houston              45244 non-null float64
Kansas City          45132 non-null float64
Minneapolis          45236 non-null float64
Saint Louis          45185 non-null float64
Chicago              44528 non-null float64
Nashville            45106 non-null float64
Indianapolis         44964 non-null float64
Atlanta     

In [56]:
pressure_df.iloc[:0].to_sql('pressure', con = engine, index = False)

### Temperature for different cities including LA

In [13]:
temperature_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/historical-hourly-weather-data/temperature.csv')


In [14]:
temperature_df.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,309.1,,,
1,2012-10-01 13:00:00,284.63,282.08,289.48,281.8,291.87,291.53,293.41,296.6,285.12,...,285.63,288.22,285.83,287.17,307.59,305.47,310.58,304.4,304.4,303.5
2,2012-10-01 14:00:00,284.629041,282.083252,289.474993,281.797217,291.868186,291.533501,293.403141,296.608509,285.154558,...,285.663208,288.247676,285.83465,287.186092,307.59,304.31,310.495769,304.4,304.4,303.5
3,2012-10-01 15:00:00,284.626998,282.091866,289.460618,281.789833,291.862844,291.543355,293.392177,296.631487,285.233952,...,285.756824,288.32694,285.84779,287.231672,307.391513,304.281841,310.411538,304.4,304.4,303.5
4,2012-10-01 16:00:00,284.624955,282.100481,289.446243,281.782449,291.857503,291.553209,293.381213,296.654466,285.313345,...,285.85044,288.406203,285.860929,287.277251,307.1452,304.238015,310.327308,304.4,304.4,303.5


In [15]:
temperature_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null object
Vancouver            44458 non-null float64
Portland             45252 non-null float64
San Francisco        44460 non-null float64
Seattle              45250 non-null float64
Los Angeles          45250 non-null float64
San Diego            45252 non-null float64
Las Vegas            45252 non-null float64
Phoenix              45250 non-null float64
Albuquerque          45252 non-null float64
Denver               45252 non-null float64
San Antonio          45252 non-null float64
Dallas               45249 non-null float64
Houston              45250 non-null float64
Kansas City          45252 non-null float64
Minneapolis          45240 non-null float64
Saint Louis          45252 non-null float64
Chicago              45250 non-null float64
Nashville            45251 non-null float64
Indianapolis         45246 non-null float64
Atlanta     

In [16]:
temperature_df['datetime'] = pd.to_datetime(temperature_df.datetime)

In [17]:
temperature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null datetime64[ns]
Vancouver            44458 non-null float64
Portland             45252 non-null float64
San Francisco        44460 non-null float64
Seattle              45250 non-null float64
Los Angeles          45250 non-null float64
San Diego            45252 non-null float64
Las Vegas            45252 non-null float64
Phoenix              45250 non-null float64
Albuquerque          45252 non-null float64
Denver               45252 non-null float64
San Antonio          45252 non-null float64
Dallas               45249 non-null float64
Houston              45250 non-null float64
Kansas City          45252 non-null float64
Minneapolis          45240 non-null float64
Saint Louis          45252 non-null float64
Chicago              45250 non-null float64
Nashville            45251 non-null float64
Indianapolis         45246 non-null float64
Atla

In [28]:
temperature_df.to_csv('temperature1.csv', index=False ,sep = ',')

In [29]:
temperature_df.iloc[:0].to_sql('temperature', con = engine, index = False)

### Weather describtion for different cities including LA

In [36]:
weather_desc_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/historical-hourly-weather-data/weather_description.csv')


In [37]:
weather_desc_df.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,haze,,,
1,2012-10-01 13:00:00,mist,scattered clouds,light rain,sky is clear,mist,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,overcast clouds,sky is clear,sky is clear,sky is clear,haze,sky is clear,sky is clear,sky is clear
2,2012-10-01 14:00:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,sky is clear,sky is clear,broken clouds,overcast clouds,sky is clear,overcast clouds
3,2012-10-01 15:00:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds
4,2012-10-01 16:00:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds


In [38]:
weather_desc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null object
Vancouver            44460 non-null object
Portland             45252 non-null object
San Francisco        44460 non-null object
Seattle              45252 non-null object
Los Angeles          45252 non-null object
San Diego            45252 non-null object
Las Vegas            45252 non-null object
Phoenix              45252 non-null object
Albuquerque          45252 non-null object
Denver               45252 non-null object
San Antonio          45252 non-null object
Dallas               45252 non-null object
Houston              45252 non-null object
Kansas City          45252 non-null object
Minneapolis          45252 non-null object
Saint Louis          45252 non-null object
Chicago              45252 non-null object
Nashville            45252 non-null object
Indianapolis         45252 non-null object
Atlanta              45252 non-

In [62]:
weather_desc_df.iloc[:0].to_sql('weather_desc', con = engine, index = False)

### Wind direction for different cities including LA

In [39]:
wind_direction_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/historical-hourly-weather-data/wind_direction.csv')


In [40]:
wind_direction_df.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,360.0,,,
1,2012-10-01 13:00:00,0.0,0.0,150.0,0.0,0.0,0.0,0.0,10.0,360.0,...,270.0,260.0,230.0,60.0,135.0,101.0,30.0,336.0,336.0,329.0
2,2012-10-01 14:00:00,6.0,4.0,147.0,2.0,0.0,0.0,8.0,9.0,360.0,...,270.0,260.0,230.0,60.0,157.0,315.0,30.0,336.0,336.0,329.0
3,2012-10-01 15:00:00,20.0,18.0,141.0,10.0,0.0,0.0,23.0,9.0,360.0,...,271.0,260.0,231.0,60.0,157.0,307.0,30.0,336.0,336.0,329.0
4,2012-10-01 16:00:00,34.0,31.0,135.0,17.0,0.0,0.0,37.0,9.0,360.0,...,272.0,260.0,233.0,60.0,157.0,294.0,30.0,336.0,336.0,329.0


In [44]:
wind_direction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null object
Vancouver            44458 non-null float64
Portland             45252 non-null float64
San Francisco        44459 non-null float64
Seattle              45252 non-null float64
Los Angeles          45252 non-null float64
San Diego            45252 non-null float64
Las Vegas            45248 non-null float64
Phoenix              45252 non-null float64
Albuquerque          45252 non-null float64
Denver               45252 non-null float64
San Antonio          45252 non-null float64
Dallas               45252 non-null float64
Houston              45251 non-null float64
Kansas City          45252 non-null float64
Minneapolis          45252 non-null float64
Saint Louis          45252 non-null float64
Chicago              45252 non-null float64
Nashville            45251 non-null float64
Indianapolis         45252 non-null float64
Atlanta     

In [63]:
wind_direction_df.iloc[:0].to_sql('wind_direction', con = engine, index = False)

### Wind Speed for different cities including LA

In [30]:
wind_speed_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/historical-hourly-weather-data/wind_direction.csv')


In [31]:
wind_speed_df.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,360.0,,,
1,2012-10-01 13:00:00,0.0,0.0,150.0,0.0,0.0,0.0,0.0,10.0,360.0,...,270.0,260.0,230.0,60.0,135.0,101.0,30.0,336.0,336.0,329.0
2,2012-10-01 14:00:00,6.0,4.0,147.0,2.0,0.0,0.0,8.0,9.0,360.0,...,270.0,260.0,230.0,60.0,157.0,315.0,30.0,336.0,336.0,329.0
3,2012-10-01 15:00:00,20.0,18.0,141.0,10.0,0.0,0.0,23.0,9.0,360.0,...,271.0,260.0,231.0,60.0,157.0,307.0,30.0,336.0,336.0,329.0
4,2012-10-01 16:00:00,34.0,31.0,135.0,17.0,0.0,0.0,37.0,9.0,360.0,...,272.0,260.0,233.0,60.0,157.0,294.0,30.0,336.0,336.0,329.0


In [33]:
wind_speed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
datetime             45253 non-null object
Vancouver            44458 non-null float64
Portland             45252 non-null float64
San Francisco        44459 non-null float64
Seattle              45252 non-null float64
Los Angeles          45252 non-null float64
San Diego            45252 non-null float64
Las Vegas            45248 non-null float64
Phoenix              45252 non-null float64
Albuquerque          45252 non-null float64
Denver               45252 non-null float64
San Antonio          45252 non-null float64
Dallas               45252 non-null float64
Houston              45251 non-null float64
Kansas City          45252 non-null float64
Minneapolis          45252 non-null float64
Saint Louis          45252 non-null float64
Chicago              45252 non-null float64
Nashville            45251 non-null float64
Indianapolis         45252 non-null float64
Atlanta     

In [34]:
wind_speed_df.datetime = pd.to_datetime(wind_speed_df.datetime)

In [35]:
wind_speed_df.iloc[:0].to_sql('wind_speed', con = engine, index = False)

In [36]:
wind_speed_df.to_csv('wind_speed.csv', index=False ,sep = ',')

### Demographics for LA by zip code

In [48]:
census_zip_code_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/los-angeles-census-data/2010-census-populations-by-zip-code.csv')


In [49]:
census_zip_code_df.head()

Unnamed: 0,Zip Code,Total Population,Median Age,Total Males,Total Females,Total Households,Average Household Size
0,91371,1,73.5,0,1,1,1.0
1,90001,57110,26.6,28468,28642,12971,4.4
2,90002,51223,25.5,24876,26347,11731,4.36
3,90003,66266,26.3,32631,33635,15642,4.22
4,90004,62180,34.8,31302,30878,22547,2.73


In [52]:
census_zip_code_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 7 columns):
Zip Code                  319 non-null int64
Total Population          319 non-null int64
Median Age                319 non-null float64
Total Males               319 non-null int64
Total Females             319 non-null int64
Total Households          319 non-null int64
Average Household Size    319 non-null float64
dtypes: float64(2), int64(5)
memory usage: 17.6 KB


In [85]:
census_zip_code_df.iloc[:0].to_sql('census_zip_code', con = engine, index = False)

### Demographics for LA by council district

In [53]:
census_council_dist_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/los-angeles-census-data/census-data-by-council-district.csv')


In [54]:
census_council_dist_df.head()

Unnamed: 0,Council District,Pop2010,White_pop,Black_pop,Ameri_es_pop,Asian_pop,Hawn_pi_pop,Hispanic_pop,Other_pop,Multi_pop,...,Marhh_chd,Marhh_no_c,Mhh_child,Fhh_child,Families,Avg_family_size,Housing_units,Vacant,Owner_occ,Renter_occ
0,1 - Gilbert Cedillo,245216.22,90857.0,7759.13,2991.73,43397.31,243.41,172971.55,88016.11,11951.53,...,17472.38,12582.79,3923.03,8697.64,51391.85,2.68,85219.33,6204.52,15219.58,63795.23
1,10 - Herb J. Wesson Jr.,261297.85,68458.12,67087.07,2307.88,43148.34,308.11,126872.43,68188.27,11800.07,...,17301.48,14284.08,3736.82,10927.24,57677.78,2.52,103637.46,7721.54,21764.3,74151.62
2,11 - Mike Bonin,250726.93,174020.7,13569.6,1214.43,30132.14,521.02,47243.32,18894.8,12374.24,...,17305.63,24708.3,1874.2,4632.45,55447.15,2.07,123048.13,9183.91,50753.74,63110.49
3,12 - Mitchell Englander,258715.95,158939.64,11239.45,1193.56,46815.27,374.43,70827.63,27943.83,12209.77,...,22493.92,26040.98,2137.22,5223.44,63921.59,2.72,90415.8,3683.73,60274.21,26457.86
4,13 - Mitch O'Farrell,247142.04,113391.23,9109.58,2320.07,43740.54,315.72,133038.67,66020.53,12244.39,...,15096.73,14479.35,3100.5,7569.9,49457.04,2.48,102632.78,7725.95,13638.56,81268.27


In [55]:
census_council_dist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 41 columns):
Council District        15 non-null object
Pop2010                 15 non-null float64
White_pop               15 non-null float64
Black_pop               15 non-null float64
Ameri_es_pop            15 non-null float64
Asian_pop               15 non-null float64
Hawn_pi_pop             15 non-null float64
Hispanic_pop            15 non-null float64
Other_pop               15 non-null float64
Multi_pop               15 non-null float64
Male_pop                15 non-null float64
Female_pop              15 non-null float64
Age_under5Ê             15 non-null float64
Age_5_9                 15 non-null float64
Age_10_14               15 non-null float64
Age_15_19               15 non-null float64
Age_20_24               15 non-null float64
Age_25_34               15 non-null float64
Age_35_44               15 non-null float64
Age_45_54               15 non-null float64
Age_55_64         

In [67]:
census_council_dist_df.iloc[:0].to_sql('census_council_district', con = engine, index = False)

### Demographics for LA by council neighborhood


In [57]:
census_council_neigh_df = pd.read_csv('/Users/aminenhila/Desktop/Metis/Project3/los-angeles-census-data/census-data-by-neighborhood-council.csv')


In [58]:
census_council_neigh_df.head()

Unnamed: 0,NC_Name,Total Population,White_pop,Black_pop,Ameri_es_pop,Asian_pop,Hawn_pi_pop,Hispanic_pop,Other_pop,Multi_pop,In_Poverty,Owner_occ,Renter_occ
0,ARLETA NC,34932.84,2882.67,409.67,67.64,4061.31,34.96,,37.58,245.05,34700.56,5590.27,2159.94
1,ARROYO SECO NC,21711.47,4640.35,782.76,89.51,3215.37,25.38,,66.98,263.12,21582.85,4138.14,3235.61
2,ATWATER VILLAGE NC,11385.4,3450.7,127.8,0.15,2379.08,71.85,,0.37,252.76,11345.53,1877.3,2902.89
3,BEL AIR-BEVERLY CREST NC,26789.14,22024.61,349.63,0.02,1787.79,0.01,,156.92,946.28,26623.97,8939.21,1943.18
4,BOYLE HEIGHTS NC,81900.56,1562.2,640.97,63.63,2266.59,73.99,,90.3,98.5,81144.31,4743.28,16001.77


In [59]:
census_council_neigh_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 13 columns):
NC_Name             97 non-null object
Total Population    97 non-null float64
White_pop           97 non-null float64
Black_pop           97 non-null float64
Ameri_es_pop        97 non-null float64
Asian_pop           97 non-null float64
Hawn_pi_pop         97 non-null float64
Hispanic_pop        0 non-null float64
Other_pop           97 non-null float64
Multi_pop           97 non-null float64
In_Poverty          97 non-null float64
Owner_occ           97 non-null float64
Renter_occ          97 non-null float64
dtypes: float64(12), object(1)
memory usage: 10.0+ KB


In [68]:
census_council_neigh_df.iloc[:0].to_sql('census_neighborhood_council', con = engine, index = False)

### MedianListing Home Prices by zip code 

In [68]:
median_listing_price = pd.read_csv('MedianListingPrice.csv', encoding='latin-1')

In [69]:
median_listing_price.head()

Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,2010-01,2010-02,2010-03,2010-04,...,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,795000.0,775000.0,799000.0,744500.0,...,1270000.0,1224500.0,1195000.0,1195000.0,1195000.0,1199000.0,1195000.0,1195000.0,1195000.0,1195000.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,389900.0,381900.0,369900.0,375000.0,...,449000.0,454900.0,469450.0,469000.0,466250.0,472450.0,459999.0,450000.0,450000.0,450000.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,,,...,1695000.0,1700000.0,1788000.0,1750000.0,1725000.0,1765000.0,1825000.0,1800000.0,1835000.0,1885000.0
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,287500.0,286808.0,290990.0,289900.0,...,350000.0,355924.0,355000.0,359990.0,359900.0,365000.0,365000.0,359900.0,356000.0,359000.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,489000.0,474000.0,459900.0,449500.0,...,652500.0,635000.0,622200.0,609950.0,599000.0,599000.0,625000.0,649900.0,649000.0,649000.0


In [71]:
median_listing_price_LA = median_listing_price[median_listing_price['City']=='Los Angeles']

In [76]:
median_listing_price_LA.to_csv('/Users/aminenhila/Desktop/Metis/Project3/LA_MedianListingPrice.csv', index=False ,sep = ',')


In [79]:
median_listing_price_LA.iloc[:0].to_sql('la_median_listing_price', con = engine, index = False)