# **Deep Dive Project - Group 10**
### Analyzing and Predicting California's Unemployment Rate Using Deep Learning

#### Importing Libraries

In [None]:
import numpy as np # numpy for computation
import pandas as pd  # pandas library for computation

#### Loading Dataset

In [None]:
def getfile(location_pair, indict=None):
    """Load a CSV into a DataFrame, preferring a local copy over Google Drive.

    Parameters
    ----------
    location_pair : tuple of (str, str)
        ``(local_path, gdrive_share_url)``. The share URL is split on ``'/'``
        and its second-to-last segment is used as the Drive file id, e.g.
        ``https://drive.google.com/file/d/<id>/view?usp=sharing``.
    indict : dict, optional
        Keyword arguments forwarded to ``pd.read_csv`` (for example
        ``{"sep": ";"}``). ``None`` or an empty dict means pandas defaults.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Notes
    -----
    Only ``FileNotFoundError`` from the local read triggers the fallback;
    any other error (and any error from the remote read) propagates.
    """
    loc, gdrive = location_pair
    # Copy so the caller's dict is never shared or mutated; None avoids the
    # mutable-default-argument pitfall.
    read_kwargs = dict(indict) if indict else {}
    try:
        # ** forwards the options as keywords (a single * would pass the
        # dict's keys as positional arguments).
        out = pd.read_csv(loc, **read_kwargs)
    except FileNotFoundError:
        print("local file not found; accessing Google Drive")
        # Build the direct-download URL from the file id in the share link.
        loc = 'https://drive.google.com/uc?export=download&id=' + gdrive.split('/')[-2]
        out = pd.read_csv(loc, **read_kwargs)
    return out

In [None]:
# (local CSV path, Google Drive share link) for the LAUS dataset
fname = (
    "Local Area Unemployment Statistics (LAUS).csv",
    "https://drive.google.com/file/d/1PhDuxwfj42OqDN9XnLx56UbXgY5SNQTC/view?usp=sharing",
)
data = getfile(fname)

data.info()      # column dtypes and non-null counts
data.head(n=5)   # preview of the first five rows

local file not found; accessing Google Drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203072 entries, 0 to 203071
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   _id                       203072 non-null  int64  
 1   Area Name                 203072 non-null  object 
 2   Area Type                 203072 non-null  object 
 3   Year                      203072 non-null  int64  
 4   Month                     203072 non-null  object 
 5   Date_Numeric              203072 non-null  object 
 6   Seasonally Adjusted(Y/N)  203072 non-null  object 
 7   Status                    203072 non-null  object 
 8   Labor Force               203072 non-null  int64  
 9   Employment                203072 non-null  int64  
 10  Unemployment              203072 non-null  int64  
 11  Unemployment Rate         203072 non-null  float64
 12  Benchmark                 203072 non-null  int64  
dtyp

Unnamed: 0,_id,Area Name,Area Type,Year,Month,Date_Numeric,Seasonally Adjusted(Y/N),Status,Labor Force,Employment,Unemployment,Unemployment Rate,Benchmark
0,1,California,State,1976,January,01/1976,N,Final,9672362,8668016,1004346,10.4,2020
1,2,California,State,1976,January,01/1976,Y,Final,9774280,8875685,898595,9.2,2020
2,3,California,State,1976,February,02/1976,N,Final,9684440,8704564,979876,10.1,2020
3,4,California,State,1976,February,02/1976,Y,Final,9768885,8871553,897332,9.2,2020
4,5,California,State,1976,March,03/1976,N,Final,9689626,8776344,913282,9.4,2020


In [None]:
# Parse the date-like columns and drop the Year/Month columns.
# Built as a single chain that rebinds `data` instead of mutating it in place,
# and errors='ignore' on the drop means re-running this cell does not raise
# KeyError once Year/Month have already been removed.
data = (
    data
    .assign(
        # 'MM/YYYY' strings -> timestamps
        Date_Numeric=lambda df: pd.to_datetime(df['Date_Numeric'], format='%m/%Y'),
        # benchmark year -> timestamp
        Benchmark=lambda df: pd.to_datetime(df['Benchmark'], format='%Y'),
    )
    .drop(columns=['Year', 'Month'], errors='ignore')
)
data.info()
data.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203072 entries, 0 to 203071
Data columns (total 11 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   _id                       203072 non-null  int64         
 1   Area Name                 203072 non-null  object        
 2   Area Type                 203072 non-null  object        
 3   Date_Numeric              203072 non-null  datetime64[ns]
 4   Seasonally Adjusted(Y/N)  203072 non-null  object        
 5   Status                    203072 non-null  object        
 6   Labor Force               203072 non-null  int64         
 7   Employment                203072 non-null  int64         
 8   Unemployment              203072 non-null  int64         
 9   Unemployment Rate         203072 non-null  float64       
 10  Benchmark                 203072 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(4), object(4)
memory usag

Unnamed: 0,_id,Area Name,Area Type,Date_Numeric,Seasonally Adjusted(Y/N),Status,Labor Force,Employment,Unemployment,Unemployment Rate,Benchmark
0,1,California,State,1976-01-01,N,Final,9672362,8668016,1004346,10.4,2020-01-01
1,2,California,State,1976-01-01,Y,Final,9774280,8875685,898595,9.2,2020-01-01
2,3,California,State,1976-02-01,N,Final,9684440,8704564,979876,10.1,2020-01-01
3,4,California,State,1976-02-01,Y,Final,9768885,8871553,897332,9.2,2020-01-01
4,5,California,State,1976-03-01,N,Final,9689626,8776344,913282,9.4,2020-01-01


#### Pickle the Datasets

In [None]:
# Draw two random subsets of the full frame, both with random_state=42:
#   debugging_data -> 1% of the rows
#   working_data   -> 30% of the rows
debugging_data = data.sample(frac=0.01, random_state=42)
working_data = data.sample(frac=0.3, random_state=42)

# Persist the two subsets and the full frame as pickles in the working directory.
for _frame, _path in (
    (debugging_data, 'debugging_data.pkl'),
    (working_data, 'working_data.pkl'),
    (data, 'original_data.pkl'),
):
    _frame.to_pickle(_path)

#### Loading Pickled Datasets

```
# Load the pickled datasets
debugging_data = pd.read_pickle('debugging_data.pkl')
working_data = pd.read_pickle('working_data.pkl')
original_data = pd.read_pickle('original_data.pkl')
```

