# RICE-VIRT-DATA-PT-05-2022-U-B-MW Final Project

## Code Summary
- **Purpose  :** Evaluation of Crime Trends Machine Learning Resampling Algorithms 
- **Created  :** 2022 Sept 29 22:25:12 UTC (Meghan E. Hull)

## Dependencies

In [1]:
import warnings
# Suppress every warning for the rest of the session so that cell output
# stays uncluttered.
# NOTE(review): this is a blanket filter, so it also hides any warnings
# raised by the libraries imported below -- confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
import re
from sqlalchemy import create_engine
import psycopg2

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Version Check

In [4]:
!python --version

Python 3.7.13


In [5]:
!conda list | findstr numpy

numpy                     1.21.5           py37h7a0a035_1  
numpy-base                1.21.5           py37hca35cd5_1  
numpydoc                  1.2                pyhd3eb1b0_0  


In [6]:
!conda list | findstr pandas

pandas                    1.3.5            py37h6214cd6_0  


In [7]:
!conda list | findstr scipy

scipy                     1.7.3            py37h0a974cb_0  


In [8]:
!conda list | findstr scikit-learn

scikit-learn              1.0.2            py37hf11a4ad_1  
scikit-learn-intelex      2021.5.0         py37haa95532_0  


In [9]:
!conda list | findstr imbalanced-learn

imbalanced-learn          0.9.0                    pypi_0    pypi


## Instantiate a Linear Regression Model

In [7]:
# Create the scikit-learn LinearRegression estimator with its default
# parameters (no arguments are passed); it is not fitted in this cell.
model = LinearRegression()

## Report Tables

In [8]:
# Empty report table: one column per evaluation metric and no rows yet.
# NOTE(review): the High/Low Risk columns name classification metrics, while
# the model instantiated in this notebook is a LinearRegression -- confirm
# these are the metrics the report is meant to hold.
summary_df = pd.DataFrame(
    columns=[
        # Overall score
        'Balanced Accuracy Score',
        # Precision per class
        'High Risk Precision Score',
        'Low Risk Precision Score',
        # Recall per class
        'High Risk Recall Score',
        'Low Risk Recall Score',
        # F1 per class
        'High Risk F1 Score',
        'Low Risk F1 Score',
    ],
)

# 1. Import & Prep Client Data

## 1.1 Import & Initial Cleaning of Data

In [9]:
# Import the database password from the local config module
from urllib.parse import quote

from config import db_password

# Define connection string.
# The password is percent-encoded (safe='' encodes every reserved character)
# so that characters with special meaning inside a URL -- such as '@', ':',
# '/', '%' or '#' -- cannot be mistaken for URL delimiters when the string is
# parsed. Passwords made only of unreserved characters are left unchanged.
db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/Crime_AnalysisDB"

In [10]:
 # Create the SQLAlchemy engine for the connection string held in db_string
engine = create_engine(db_string)

In [21]:
# Load the 'harris_county' and 'outter_hou' tables into DataFrames
harris_county_df = pd.read_sql_table('harris_county', con=engine)
outter_hou_df = pd.read_sql_table('outter_hou', con=engine)

In [30]:
# Discard the 'index' column that came back with the table
harris_county_df = harris_county_df.drop(columns=['index'])

In [31]:
# Show the column labels that remain after dropping 'index'
harris_county_df.columns

Index(['County', 'Year', 'Agency_Count', 'Murder', 'Rape', 'Assault',
       'Burglary', 'Larceny', 'Auto_Theft', 'Violent_Offenses',
       'NonViolent_Offenses', 'Total_Crime', 'Population'],
      dtype='object')

In [32]:
# Discard the two leftover index-like columns in a single call
outter_hou_df = outter_hou_df.drop(columns=['Unnamed: 0', 'index'])

In [33]:
# Show the column labels that remain after dropping 'Unnamed: 0' and 'index'
outter_hou_df.columns

Index(['County', 'Year', 'Agency_Count', 'Murder', 'Rape', 'Assault',
       'Burglary', 'Larceny', 'Auto_Theft', 'Violent_Offenses',
       'NonViolent_Offenses', 'Total_Crime', 'Population'],
      dtype='object')

## 1.2 Split into Training & Testing


In [47]:
# Target: the Violent_Offenses column, kept as a one-column DataFrame
y_Harris = harris_county_df["Violent_Offenses"].to_frame()

# Features: identifying and size columns for each county/year row.
# NOTE(review): 'County' holds text values and is included among the
# features -- confirm it is encoded or removed before the model is fitted.
X_Harris = pd.DataFrame(
    harris_county_df[
        ['County', 'Year', 'Agency_Count', 'Population']
    ]
)


In [49]:
# Preview the first rows of the feature table
X_Harris.head()

Unnamed: 0,County,Year,Agency_Count,Population
0,Harris County,2015,42,4564664
1,Harris County,2016,44,4646508
2,Harris County,2017,44,4702468
3,Harris County,2018,45,4753437
4,Harris County,2019,45,4776485


In [51]:
# Preview up to the first 10 rows of the target table
y_Harris.head(10)

Unnamed: 0,Violent_Offenses
0,18684
1,21126
2,23222
3,23709
4,23021
5,29232


# 2. Learning Stage (Curve-fitting)

In [None]:
# Learning Stage (Curve-fitting)