### Imports

In [1]:
import pandas as pd
import csv
import matplotlib as plt
from sklearn.linear_model import LinearRegression

### Importing csv file

In [2]:
allstate_df = pd.read_csv("allstatedata.csv", index_col="fips")
allstate_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 1 to 56
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    50 non-null     object 
 1   state_exp                50 non-null     object 
 2   dataYear                 50 non-null     int64  
 3   reIncarcerationRate      50 non-null     object 
 4   reIncarcerationRate2018  42 non-null     float64
 5    Employment              50 non-null     float64
 6    Homeownership           50 non-null     float64
 7    Livable Income          50 non-null     float64
 8    Postsecondary Edu       50 non-null     float64
 9    Youth                   50 non-null     float64
 10   Affordable Housing      50 non-null     float64
dtypes: float64(7), int64(1), object(3)
memory usage: 4.7+ KB


### Removing Null Values

In [3]:
nonnull_allstate_df = allstate_df.dropna()
nonnull_allstate_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42 entries, 1 to 56
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    42 non-null     object 
 1   state_exp                42 non-null     object 
 2   dataYear                 42 non-null     int64  
 3   reIncarcerationRate      42 non-null     object 
 4   reIncarcerationRate2018  42 non-null     float64
 5    Employment              42 non-null     float64
 6    Homeownership           42 non-null     float64
 7    Livable Income          42 non-null     float64
 8    Postsecondary Edu       42 non-null     float64
 9    Youth                   42 non-null     float64
 10   Affordable Housing      42 non-null     float64
dtypes: float64(7), int64(1), object(3)
memory usage: 3.9+ KB


### Converting reIncarcerationRate to numeric

In [5]:
nonnull_allstate_df[["reIncarcerationRate"]] = nonnull_allstate_df[["reIncarcerationRate"]].apply(pd.to_numeric)
nonnull_allstate_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42 entries, 1 to 56
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    42 non-null     object 
 1   state_exp                42 non-null     object 
 2   dataYear                 42 non-null     int64  
 3   reIncarcerationRate      42 non-null     float64
 4   reIncarcerationRate2018  42 non-null     float64
 5    Employment              42 non-null     float64
 6    Homeownership           42 non-null     float64
 7    Livable Income          42 non-null     float64
 8    Postsecondary Edu       42 non-null     float64
 9    Youth                   42 non-null     float64
 10   Affordable Housing      42 non-null     float64
dtypes: float64(8), int64(1), object(2)
memory usage: 3.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonnull_allstate_df[["reIncarcerationRate"]] = nonnull_allstate_df[["reIncarcerationRate"]].apply(pd.to_numeric)


### Getting description of dataframe

In [6]:
nonnull_allstate_df.describe()

Unnamed: 0,dataYear,reIncarcerationRate,reIncarcerationRate2018,Employment,Homeownership,Livable Income,Postsecondary Edu,Youth,Affordable Housing
count,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0
mean,2021.0,36.695238,36.635714,0.938595,0.686667,0.629452,0.600476,0.87881,0.722238
std,0.0,9.479167,10.504629,0.013427,0.040256,0.061451,0.052237,0.027369,0.045393
min,2021.0,21.0,21.0,0.914,0.547,0.504,0.454,0.825,0.613
25%,2021.0,29.75,28.9,0.928,0.6725,0.5805,0.571,0.85675,0.69625
50%,2021.0,36.55,34.05,0.9375,0.692,0.635,0.608,0.8795,0.7255
75%,2021.0,42.625,42.925,0.949,0.71125,0.67725,0.6425,0.89775,0.75175
max,2021.0,61.6,64.5,0.973,0.744,0.73,0.696,0.925,0.805


### Renaming Column and dropping dataYear column 

In [10]:
nonnull_allstate_df = nonnull_allstate_df.rename(columns = {"reIncarcerationRate" : "reIncarcerationRate2021"})
nonnull_allstate_df = nonnull_allstate_df.drop(["dataYear"], axis=1)
nonnull_allstate_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42 entries, 1 to 56
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    42 non-null     object 
 1   state_exp                42 non-null     object 
 2   reIncarcerationRate2021  42 non-null     float64
 3   reIncarcerationRate2018  42 non-null     float64
 4    Employment              42 non-null     float64
 5    Homeownership           42 non-null     float64
 6    Livable Income          42 non-null     float64
 7    Postsecondary Edu       42 non-null     float64
 8    Youth                   42 non-null     float64
 9    Affordable Housing      42 non-null     float64
dtypes: float64(8), object(2)
memory usage: 3.6+ KB


### Testing for Correlation

In [11]:
nonnull_allstate_df.corr(numeric_only=True)

Unnamed: 0,reIncarcerationRate2021,reIncarcerationRate2018,Employment,Homeownership,Livable Income,Postsecondary Edu,Youth,Affordable Housing
reIncarcerationRate2021,1.0,0.902558,0.036815,0.039509,0.205198,0.066236,-0.017029,-0.012428
reIncarcerationRate2018,0.902558,1.0,0.062944,-0.009909,0.16515,0.062627,-0.069692,-0.032094
Employment,0.036815,0.062944,1.0,0.37115,0.461972,0.486552,0.766202,0.453664
Homeownership,0.039509,-0.009909,0.37115,1.0,-0.051059,-0.092202,0.17529,0.63959
Livable Income,0.205198,0.16515,0.461972,-0.051059,1.0,0.75749,0.750938,-0.250503
Postsecondary Edu,0.066236,0.062627,0.486552,-0.092202,0.75749,1.0,0.690099,-0.198169
Youth,-0.017029,-0.069692,0.766202,0.17529,0.750938,0.690099,1.0,0.027738
Affordable Housing,-0.012428,-0.032094,0.453664,0.63959,-0.250503,-0.198169,0.027738,1.0


### Exporting the cleaned csv file  

In [12]:
nonnull_allstate_df.to_csv("clean_allstatedata.csv", encoding = "utf-8")