In [36]:
import pandas as pd

data = pd.read_csv('data_cleaned.csv',index_col=0)
numerical_cols = [i for i in data.columns if i not in ['states','City']]
data[numerical_cols] = data[numerical_cols].applymap(lambda x: float(x))
crimes = [i for i in data.columns if i not in ['states','City','total_crimes','Population']]
data.head()

Unnamed: 0,states,City,Population,total_crimes,Aggravated Assault,Simple Assault,Intimidation,Murder and Nonnegligent Manslaughter,Negligent Man- slaughter,Justifiable Homicide,...,Drug Equipment Violations,Betting/ Wagering,Operating/ Promoting/ Assisting Gambling,Gambling Equipment Violations,Sports Tampering,Por- nography/ Obscene Material,Pros- titution,Assisting or Promoting Prostitution,Purchasing Prostitution,Weapon Law Violations
0,ALABAMA,Hoover,85163.0,4627.0,52.0,539.0,201.0,3.0,0.0,0.0,...,148.0,0.0,0.0,0.0,0.0,7.0,15.0,0.0,0.0,45.0
1,ARIZONA,Apache Junction,38519.0,2964.0,82.0,345.0,64.0,0.0,0.0,0.0,...,166.0,0.0,0.0,0.0,0.0,4.0,0.0,10.0,0.0,37.0
2,ARIZONA,Gilbert,247324.0,8676.0,121.0,846.0,202.0,2.0,1.0,1.0,...,845.0,0.0,0.0,0.0,0.0,13.0,10.0,1.0,0.0,46.0
3,ARIZONA,Yuma,93923.0,7985.0,330.0,668.0,167.0,5.0,1.0,0.0,...,573.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,74.0
4,ARKANSAS,Alma,5581.0,661.0,16.0,129.0,84.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0


In [17]:
data['total_crimes'].sum()

4106936.0

In [37]:
df = data.head(5)

def convert_to_md(frame,name):
    cols = frame.columns
# Create a new DataFrame with just the markdown
# strings
    df2 = pd.DataFrame([['---',]*len(cols)], columns=cols)
#Create a new concatenated DataFrame
    df3 = pd.concat([df2, frame])
#Save as markdown
    df3.to_csv(name+".md", sep="|", index=False)
    
convert_to_md(df,'cleaned')

In [25]:
def model(data,fraud_crimes,corr):
    df = data.copy()
    import warnings
    import numpy as np
    warnings.filterwarnings('ignore')
    states = df['states'].unique() 
    #group all the fraud crimes together
    if len(fraud_crimes)>1:
        df['fraud_count'] = df[fraud_crimes].sum(axis=1)
        df.drop(fraud_crimes,axis=1,inplace=True)
    else:
        df['fraud_count'] = df[fraud_crimes[0]]
        df.drop(fraud_crimes,axis=1,inplace=True)
        
        
    combined_frame = pd.DataFrame()
    
    #loop over each state
    for state in states:
        dummy_frame = df[df['states']==state]
        #if the number of towns in the state is greater than the total number of crimes in the dataset (56),
        #use the correlation coefficients of each crime for only that state
        if len(dummy_frame)>=56:
            correlations = pd.DataFrame(dummy_frame.corr()['fraud_count']).reset_index()
            correlations['fraud_count'] = correlations['fraud_count'].apply(lambda x: x if x>=corr else 0)
            correlations['crime'] = correlations['index'].apply(lambda x: 1 if x in crimes+['fraud'] else 0)
            correlations = correlations[correlations['crime']==1].drop('crime',axis=1)
            for i in correlations.index:
                factor = correlations.loc[i]['fraud_count']
                feature = correlations.loc[i]['index']
                if feature not in ['total_crimes','fraud_count']:
                    dummy_frame[feature] = dummy_frame[feature].apply(lambda x: factor*x)
    
        elif len(dummy_frame)<56:
        #if the state has less than 56, use the correlation coefficients for the entire country
        #to calculate the weights.
            correlations = pd.DataFrame(df.corr()['fraud_count']).reset_index()
            correlations['fraud_count'] = correlations['fraud_count'].apply(lambda x: x if x>=corr else 0)
            correlations['crime'] = correlations['index'].apply(lambda x: 1 if x in crimes+['fraud'] else 0)
            correlations = correlations[correlations['crime']==1].drop('crime',axis=1)
        
        for i in correlations.index:
            factor = correlations.loc[i]['fraud_count']
            feature = correlations.loc[i]['index']
            if feature not in ['total_crimes','fraud_count']:
                dummy_frame[feature] = dummy_frame[feature].apply(lambda x: factor*x)
        combined_frame = pd.concat([combined_frame,dummy_frame],axis=0)
    
    model_groupby = combined_frame.groupby('states').sum().reset_index()
    model_groupby_crimes = [i for i in model_groupby.columns if i not in ['states','Population','total_crimes']]
    model_groupby['est.Frauds'] = model_groupby[model_groupby_crimes].sum(axis=1)
    model_groupby['%est.Caught'] = model_groupby['fraud_count'].div(model_groupby['est.Frauds'],axis=0).multiply(100)
    model_groupby = model_groupby[['states','Population','total_crimes','fraud_count','est.Frauds','%est.Caught']]
    model_groupby['fraud_prob'] = model_groupby['est.Frauds'].div(model_groupby['Population'],axis=0)
    model_groupby.sort_values(by='fraud_prob',ascending=True,inplace=True)
    
    model_groupby = model_groupby[['states','fraud_prob','Population','total_crimes','fraud_count','est.Frauds','%est.Caught']]
    model_groupby = model_groupby.reset_index().drop('index',axis=1)
    
    
    nation_wide_arrests = model_groupby['fraud_count'].sum()
    nation_wide_estimates = model_groupby['est.Frauds'].sum()
    percent_caught = round(100*nation_wide_arrests/nation_wide_estimates,2)
    
    print('crimes analzed: ',fraud_crimes)
    print('Nation wide % of crimes where arrests are made',percent_caught,'%')
    print('Nation wide arrests :',round(nation_wide_arrests))
    print('Nation wide estimate number of crimes: ',round(nation_wide_estimates))
    return model_groupby

In [23]:
fraud_crimes = ['Credit Card/ Automated Teller Machine Fraud', 'Imper- sonation',
       'Welfare Fraud', 'Wire Fraud','False Pretenses/ Swindle/ Confidence Game','Counter- feiting/ Forgery']

In [26]:
fraud_results = model(data,fraud_crimes,0.5)
fraud_results

crimes analzed:  ['Credit Card/ Automated Teller Machine Fraud', 'Imper- sonation', 'Welfare Fraud', 'Wire Fraud', 'False Pretenses/ Swindle/ Confidence Game', 'Counter- feiting/ Forgery']
Nation wide % of crimes where arrests are made 9.2 %
Nation wide arrests : 302783.0
Nation wide estimate number of crimes:  3291678.0


Unnamed: 0,states,fraud_prob,Population,total_crimes,fraud_count,est.Frauds,%est.Caught
0,CONNECTICUT,0.018755,2131967.0,75676.0,8410.0,39984.732828,21.033028
1,VERMONT,0.028646,356298.0,12130.0,659.0,10206.526964,6.456653
2,MASSACHUSETTS,0.029728,5589329.0,220709.0,18988.0,166157.165836,11.427735
3,PENNSYLVANIA,0.029747,192011.0,6830.0,901.0,5711.734907,15.774542
4,RHODE ISLAND,0.0378,1074282.0,48100.0,3988.0,40608.269721,9.82066
5,WISCONSIN,0.041495,1692713.0,112536.0,6882.0,70238.724227,9.798014
6,COLORADO,0.042639,3849768.0,245743.0,20101.0,164151.524847,12.245393
7,ARIZONA,0.042895,379766.0,19625.0,1484.0,16290.180241,9.109783
8,NEW HAMPSHIRE,0.044739,1180502.0,61688.0,6252.0,52814.510091,11.837656
9,OKLAHOMA,0.044845,793311.0,48728.0,3541.0,35575.804513,9.953394


In [35]:
top5 = fraud_results[['states','fraud_prob','%est.Caught']].head(5)
convert_to_md(top5,'top5')

In [27]:
fraud_results.to_csv('results.csv')

In [29]:
fraud_crimes1 = ['Credit Card/ Automated Teller Machine Fraud']
model_results = model(data,fraud_crimes1,0.5)

crimes analzed:  ['Credit Card/ Automated Teller Machine Fraud']
Nation wide % of crimes where arrests are made 2.71 %
Nation wide arrests : 69914.0
Nation wide estimate number of crimes:  2576097.0
