In [4]:
import pandas as pd

data = pd.read_csv('data_cleaned.csv',index_col=0)
numerical_cols = [i for i in data.columns if i not in ['states','Agency Name']]
data[numerical_cols] = data[numerical_cols].applymap(lambda x: float(x))
crimes = [i for i in data.columns if i not in ['states','Agency Name','total_crimes','Population1']]
data.head()

Unnamed: 0,states,Agency Name,Population1,Intimidation,HomicideOffenses,Murder andNonnegligentManslaughter,NegligentMan-slaughter,JustifiableHomicide,HumanTraffickingOffenses,CommercialSex Acts,...,Operating/Promoting/AssistingGambling,GamblingEquipmentViolations,SportsTampering,Por-nography/ObsceneMaterial,Pros-titutionOffenses,Pros-titution,Assisting orPromotingProstitution,PurchasingProstitution,WeaponLawViolations,total_crimes
0,ALABAMA,Hoover,85163.0,52.0,539.0,201.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,7.0,15.0,15.0,0.0,0.0,45.0,7114.0
1,ARIZONA,Apache Junction,38519.0,82.0,345.0,64.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,10.0,0.0,10.0,0.0,37.0,4022.0
2,ARIZONA,Gilbert,247324.0,121.0,846.0,202.0,4.0,2.0,1.0,1.0,...,0.0,0.0,0.0,13.0,11.0,10.0,1.0,0.0,46.0,11988.0
3,ARIZONA,Yuma,93923.0,330.0,668.0,167.0,6.0,5.0,1.0,0.0,...,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,74.0,10739.0
4,ARKANSAS,Alma,5581.0,16.0,129.0,84.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,4.0,919.0


In [5]:
data['total_crimes'].sum()

5693147.0

In [4]:
df = data.head(5)

def convert_to_md(frame,name):
    cols = frame.columns
# Create a new DataFrame with just the markdown
# strings
    df2 = pd.DataFrame([['---',]*len(cols)], columns=cols)
#Create a new concatenated DataFrame
    df3 = pd.concat([df2, frame])
#Save as markdown
    df3.to_csv(name+".md", sep="|", index=False)
    
convert_to_md(df,'cleaned')

This model is good. For credit card fraud comitted by perpetrators in the USA, only about 2% of them lead to arrests. For murders, about 41% of crimes lead to arrests.

In [2]:
def model(data,fraud_crimes,corr):
    df = data.copy()
    import warnings
    import numpy as np
    warnings.filterwarnings('ignore')
    states = df['states'].unique() 
    #group all the fraud crimes together
    if len(fraud_crimes)>1:
        df['fraud_count'] = df[fraud_crimes].sum(axis=1)
        df.drop(fraud_crimes,axis=1,inplace=True)
    else:
        df['fraud_count'] = df[fraud_crimes[0]]
        df.drop(fraud_crimes,axis=1,inplace=True)
        
        
    combined_frame = pd.DataFrame()
    
    #loop over each state
    for state in states:
        dummy_frame = df[df['states']==state]
        #if the number of towns in the state is greater than the total number of crimes in the dataset (56),
        #use the correlation coefficients of each crime for only that state
        if len(dummy_frame)>=56:
            correlations = pd.DataFrame(dummy_frame.corr()['fraud_count']).reset_index()
            correlations['fraud_count'] = correlations['fraud_count'].apply(lambda x: x if x>=corr else 0)
            correlations['crime'] = correlations['index'].apply(lambda x: 1 if x in crimes+['fraud'] else 0)
            correlations = correlations[correlations['crime']==1].drop('crime',axis=1)
            for i in correlations.index:
                factor = correlations.loc[i]['fraud_count']
                feature = correlations.loc[i]['index']
                if feature not in ['total_crimes','fraud_count']:
                    dummy_frame[feature] = dummy_frame[feature].apply(lambda x: factor*x)
    
        elif len(dummy_frame)<56:
        #if the state has less than 56, use the correlation coefficients for the entire country
        #to calculate the weights.
            correlations = pd.DataFrame(df.corr()['fraud_count']).reset_index()
            correlations['fraud_count'] = correlations['fraud_count'].apply(lambda x: x if x>=corr else 0)
            correlations['crime'] = correlations['index'].apply(lambda x: 1 if x in crimes+['fraud'] else 0)
            correlations = correlations[correlations['crime']==1].drop('crime',axis=1)
        
        for i in correlations.index:
            factor = correlations.loc[i]['fraud_count']
            feature = correlations.loc[i]['index']
            if feature not in ['total_crimes','fraud_count']:
                dummy_frame[feature] = dummy_frame[feature].apply(lambda x: factor*x)
        combined_frame = pd.concat([combined_frame,dummy_frame],axis=0)
    
    model_groupby = combined_frame.groupby('states').sum().reset_index()
    model_groupby_crimes = [i for i in model_groupby.columns if i not in ['states','Population1','total_crimes']]
    model_groupby['est.Frauds'] = model_groupby[model_groupby_crimes].sum(axis=1)
    model_groupby['%est.Caught'] = model_groupby['fraud_count'].div(model_groupby['est.Frauds'],axis=0).multiply(100)
    model_groupby = model_groupby[['states','Population1','total_crimes','fraud_count','est.Frauds','%est.Caught']]
    model_groupby['fraud_prob'] = model_groupby['est.Frauds'].div(model_groupby['Population1'],axis=0)
    model_groupby.sort_values(by='fraud_prob',ascending=True,inplace=True)
    
    model_groupby = model_groupby[['states','fraud_prob','Population1','total_crimes','fraud_count','est.Frauds','%est.Caught']]
    model_groupby = model_groupby.reset_index().drop('index',axis=1)
    
    
    nation_wide_arrests = model_groupby['fraud_count'].sum()
    nation_wide_estimates = model_groupby['est.Frauds'].sum()
    percent_caught = round(100*nation_wide_arrests/nation_wide_estimates,2)
    
    print('crimes analzed: ',fraud_crimes)
    print('Nation wide % of crimes where arrests are made',percent_caught,'%')
    print('Nation wide arrests :',round(nation_wide_arrests))
    print('Nation wide estimate number of crimes: ',round(nation_wide_estimates))
    return model_groupby

In [8]:
fraud_crimes = ['FraudOffenses','FalsePretenses/Swindle/ConfidenceGame','CreditCard/AutomatedTellerMachineFraud', 
                'Imper-sonation', 'WelfareFraud', 'WireFraud','Counter-feiting/Forgery']

In [9]:
fraud_results = model(data,fraud_crimes,0.5)
fraud_results

crimes analzed:  ['FraudOffenses', 'FalsePretenses/Swindle/ConfidenceGame', 'CreditCard/AutomatedTellerMachineFraud', 'Imper-sonation', 'WelfareFraud', 'WireFraud', 'Counter-feiting/Forgery']
Nation wide % of crimes where arrests are made 15.78 %
Nation wide arrests : 796374.0
Nation wide estimate number of crimes:  5047316.0


Unnamed: 0,states,fraud_prob,Population1,total_crimes,fraud_count,est.Frauds,%est.Caught
0,CONNECTICUT,0.034646,2131967.0,109857.0,18701.0,73864.686267,25.317917
1,MASSACHUSETTS,0.045883,5589329.0,307121.0,49528.0,256453.583533,19.312657
2,PENNSYLVANIA,0.046069,192011.0,9866.0,1915.0,8845.742894,21.648832
3,VERMONT,0.046339,356298.0,18126.0,2127.0,16510.534459,12.882684
4,RHODE ISLAND,0.05682,1074282.0,66958.0,10899.0,61040.632151,17.85532
5,ARIZONA,0.063064,379766.0,26749.0,4220.0,23949.592711,17.620341
6,NEW HAMPSHIRE,0.064658,1180502.0,85331.0,13998.0,76328.353214,18.339188
7,OKLAHOMA,0.066707,793311.0,65236.0,10411.0,52919.724195,19.673194
8,MICHIGAN,0.068435,6691114.0,530395.0,95180.0,457904.873388,20.785977
9,IDAHO,0.074196,980281.0,78444.0,9982.0,72732.657321,13.724234


In [10]:
fraud_results.to_csv('results.csv')

In [11]:
fraud_crimes1 = ['CreditCard/AutomatedTellerMachineFraud']
model_results = model(data,fraud_crimes1,0.5)
model_results

crimes analzed:  ['CreditCard/AutomatedTellerMachineFraud']
Nation wide % of crimes where arrests are made 2.27 %
Nation wide arrests : 87951.0
Nation wide estimate number of crimes:  3878504.0


Unnamed: 0,states,fraud_prob,Population1,total_crimes,fraud_count,est.Frauds,%est.Caught
0,CONNECTICUT,0.018265,2131967.0,109857.0,1663.0,38940.952957,4.270568
1,PENNSYLVANIA,0.029481,192011.0,9866.0,334.0,5660.581933,5.900453
2,VERMONT,0.031146,356298.0,18126.0,368.0,11097.336408,3.316111
3,MASSACHUSETTS,0.035658,5589329.0,307121.0,6741.0,199306.864619,3.382222
4,RHODE ISLAND,0.039458,1074282.0,66958.0,2341.0,42389.060392,5.522651
5,ARIZONA,0.040326,379766.0,26749.0,505.0,15314.391716,3.297552
6,OHIO,0.046552,4898884.0,477807.0,2148.0,228050.726701,0.941896
7,NORTH DAKOTA,0.048194,519823.0,44321.0,715.0,25052.160775,2.854045
8,ALABAMA,0.048597,85163.0,7114.0,43.0,4138.65892,1.038984
9,NEBRASKA,0.049553,411784.0,35529.0,503.0,20404.982641,2.465084
