In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [23]:
df = pd.read_csv("dataset.csv", low_memory=False)

# The data type of perpetrator's age was not seen as int initially, so there was the need to treat those values and 
# transform them to NaN (there is only one row with an NA value in 'Perpetrator Age')
df['Perpetrator Age'] = pd.to_numeric(df['Perpetrator Age'], errors='coerce').astype('Int64')

In [24]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Relationship,Weapon,Record Source
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,Yes,Male,14,Native American/Alaska Native,Male,15,Native American/Alaska Native,Acquaintance,Blunt Object,FBI
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,Yes,Male,43,White,Male,42,White,Acquaintance,Strangulation,FBI
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,Yes,Male,43,White,Male,42,White,Acquaintance,Strangulation,FBI
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI


In [25]:
df.describe()

Unnamed: 0,Record ID,Year,Incident,Victim Age,Perpetrator Age
count,638454.0,638454.0,638454.0,638454.0,638453.0
mean,319227.5,1995.801102,22.967924,35.033512,20.322697
std,184305.93872,9.927693,92.149821,41.628306,17.886842
min,1.0,1980.0,0.0,0.0,0.0
25%,159614.25,1987.0,1.0,22.0,0.0
50%,319227.5,1995.0,2.0,30.0,21.0
75%,478840.75,2004.0,10.0,42.0,31.0
max,638454.0,2014.0,999.0,998.0,99.0


In [30]:
unique_values = df['State'].value_counts().
unique_values

California              99783
Texas                   62095
New York                49268
Florida                 37164
Michigan                28448
Illinois                25871
Pennsylvania            24236
Georgia                 21088
North Carolina          20390
Louisiana               19629
Ohio                    19158
Maryland                17312
Virginia                15520
Tennessee               14930
Missouri                14832
New Jersey              14132
Arizona                 12871
South Carolina          11698
Indiana                 11463
Alabama                 11376
Oklahoma                 8809
Washington               7815
District of Columbia     7115
Arkansas                 6947
Colorado                 6593
Kentucky                 6554
Mississippi              6546
Wisconsin                6191
Massachusetts            6036
Nevada                   5553
Connecticut              4896
New Mexico               4272
Oregon                   4217
Minnesota 

## Treating data

For The initial treating of this dataset, I will get rid of columns that 

In [26]:
#number of lines with NA values count
nan_counts = df.isna().sum()
nan_counts

Record ID           0
Agency Code         0
Agency Name         0
Agency Type         0
City                0
State               0
Year                0
Month               0
Incident            0
Crime Type          0
Crime Solved        0
Victim Sex          0
Victim Age          0
Victim Race         0
Perpetrator Sex     0
Perpetrator Age     1
Perpetrator Race    0
Relationship        0
Weapon              0
Record Source       0
dtype: int64

In [5]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Relationship,Weapon,Record Source
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,Yes,Male,14,Native American/Alaska Native,Male,15,Native American/Alaska Native,Acquaintance,Blunt Object,FBI
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,Yes,Male,43,White,Male,42,White,Acquaintance,Strangulation,FBI
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,Yes,Male,43,White,Male,42,White,Acquaintance,Strangulation,FBI
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI


## Tasks

a) Identify which variables are categorical, discrete and continuous in the chosen data set and show
using some visualization or plot. Explore whether there are missing values for any of the variables.

b) Calculate the statistical parameters (mean, median, minimum, maximum, and standard deviation)
for each of the numerical variables.

c) Apply Min-Max Normalization, Z-score Standardization and Robust scalar on the numerical data
variables.

d) Line, Scatter and Heatmaps can be used to show the correlation between the features of the
dataset.

e) Graphics and descriptive understanding should be provided along with Data Exploratory analysis
(EDA). Identify subgroups of features that can explore some interesting facts.

f) Apply dummy encoding to categorical variables (at least one variable used from the data set) and
discuss the benefits of dummy encoding to understand the categorical data.

g) Apply PCA with your chosen number of components. Write up a short profile of the first few
components extracted based on your understanding.

h) What is the purpose of dimensionality reduction? Explore the situations where you can gain the
benefit of dimensionality reduction for data analysis.

-----------------

## a) Identifying variables, showing visualisations and acknowledging the existence of missing values 

### Categorical:


### Discrete:

### Continuous:

## b) Calculate the Statistical Parameters for the numerical values

That can be done in python with the .describe() function, but I also did a breakdown below for a beter visualisation of the data that I considered interesting to analyse (Victim Age and Perpetrator Age).

In [27]:
df.describe()

Unnamed: 0,Record ID,Year,Incident,Victim Age,Perpetrator Age
count,638454.0,638454.0,638454.0,638454.0,638453.0
mean,319227.5,1995.801102,22.967924,35.033512,20.322697
std,184305.93872,9.927693,92.149821,41.628306,17.886842
min,1.0,1980.0,0.0,0.0,0.0
25%,159614.25,1987.0,1.0,22.0,0.0
50%,319227.5,1995.0,2.0,30.0,21.0
75%,478840.75,2004.0,10.0,42.0,31.0
max,638454.0,2014.0,999.0,998.0,99.0


## Mean Calculation

In [6]:
victim_age_mean = df['Victim Age'].mean()
victim_age_mean = round(victim_age_mean, 2)
perpetrator_age_mean = df['Perpetrator Age'].mean()
perpetrator_age_mean = round(perpetrator_age_mean, 2)

## Mean Values

In [7]:
print("The mean for the Victim Age values is: ", victim_age_mean)
print("The mean for the Year values is: ", perpetrator_age_mean)

The mean for the Victim Age values is:  35.03
The mean for the Year values is:  20.32


## Median calculation

In [8]:
victim_age_median = df['Victim Age'].median()
perpetrator_age_median = df['Perpetrator Age'].median()

## Median Values

In [9]:
print("The median for the Victim Age values is: ", victim_age_median)
print("The median for the Year values is: ", perpetrator_age_median)

The median for the Victim Age values is:  30.0
The median for the Year values is:  21.0


## Minimum and Maximum Values Calculation

In [10]:
minimum_year = df['Year'].min()
maximum_year = df['Year'].max()

minimum_victim_age = df['Victim Age'].min()
maximum_victim_age = df['Victim Age'].max()

minimum_perpetrator_age = df['Perpetrator Age'].min()
maximum_perpetrator_age = df['Perpetrator Age'].max()

## Minimum and Maximum Values

In [11]:
print("This dataset starts in ", minimum_year, "and finishes at ", maximum_year)
print("The youngest victims were ", minimum_victim_age, " years old, and the oldest", maximum_victim_age)
print("The youngest perpetrators were", minimum_perpetrator_age, "and the oldest were", maximum_perpetrator_age)

This dataset starts in  1980 and finishes at  2014
The youngest victims were  0  years old, and the oldest 998
The youngest perpetrators were 0 and the oldest were 99


## Standard Deviation


In [12]:
victim_age_std = df['Victim Age'].std()
perpetrator_age_std = df['Perpetrator Age'].std()

In [13]:
print("The Victim Age's Standard Deviation is: ", victim_age_std)
print("The Perpetrator Age's Standard Deviation is: ", perpetrator_age_std)

The Victim Age's Standard Deviation is:  41.628336322167975
The Perpetrator Age's Standard Deviation is:  17.88684181199938


## Min-Max Normalization

For this calculation we have the following formula:

X* = X - min(X)/ max(X) - min(X)


In [14]:
df['Normalized Victim Age'] = (df['Victim Age'] - minimum_victim_age) / (maximum_victim_age - minimum_victim_age)
df['Normalized Perpetrator Age'] = (df['Perpetrator Age'] - minimum_perpetrator_age) / (maximum_perpetrator_age - minimum_perpetrator_age)

In [15]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Age,Victim Race,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Relationship,Weapon,Record Source,Normalized Victim Age,Normalized Perpetrator Age
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,14,Native American/Alaska Native,Male,15,Native American/Alaska Native,Acquaintance,Blunt Object,FBI,0.014028,0.151515
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,43,White,Male,42,White,Acquaintance,Strangulation,FBI,0.043086,0.424242
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI,0.03006,0.0
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,43,White,Male,42,White,Acquaintance,Strangulation,FBI,0.043086,0.424242
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI,0.03006,0.0


## Z-Score

For this calculation we use the formula: X* = X - Mean(X) / Standard_Deviation(X)

In [16]:
df['Z-Score Victim Age'] = (df['Victim Age'] - victim_age_mean / victim_age_std)
df['Z-Score Perpetrator Age'] = (df['Perpetrator Age'] - perpetrator_age_mean / perpetrator_age_std)

In [17]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Relationship,Weapon,Record Source,Normalized Victim Age,Normalized Perpetrator Age,Z-Score Victim Age,Z-Score Perpetrator Age
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Male,15,Native American/Alaska Native,Acquaintance,Blunt Object,FBI,0.014028,0.151515,13.158506,13.863969
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,Male,42,White,Acquaintance,Strangulation,FBI,0.043086,0.424242,42.158506,40.863969
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Unknown,0,Unknown,Unknown,Unknown,FBI,0.03006,0.0,29.158506,-1.136031
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,Male,42,White,Acquaintance,Strangulation,FBI,0.043086,0.424242,42.158506,40.863969
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Unknown,0,Unknown,Unknown,Unknown,FBI,0.03006,0.0,29.158506,-1.136031


## Robust Scaler

In [18]:
scaler = RobustScaler()
df['Robust Scaler Victim Age'] = scaler.fit_transform(df[['Victim Age']])
df['Robust Scaler Perpetrator Age'] = scaler.fit_transform(df[['Perpetrator Age']])

In [19]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Perpetrator Race,Relationship,Weapon,Record Source,Normalized Victim Age,Normalized Perpetrator Age,Z-Score Victim Age,Z-Score Perpetrator Age,Robust Scaler Victim Age,Robust Scaler Perpetrator Age
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Native American/Alaska Native,Acquaintance,Blunt Object,FBI,0.014028,0.151515,13.158506,13.863969,-0.8,-0.193548
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,White,Acquaintance,Strangulation,FBI,0.043086,0.424242,42.158506,40.863969,0.65,0.677419
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Unknown,Unknown,Unknown,FBI,0.03006,0.0,29.158506,-1.136031,0.0,-0.677419
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,White,Acquaintance,Strangulation,FBI,0.043086,0.424242,42.158506,40.863969,0.65,0.677419
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Unknown,Unknown,Unknown,FBI,0.03006,0.0,29.158506,-1.136031,0.0,-0.677419


In [20]:
# While doing the min and maximum age I've noticed that some Victim Ages from unknown cases were 
# described as 998, so I am leaving this note for the future so i can treat and drop these lines, 
# or do something else with them, like substituting for the mean/median
above_100 = (df['Victim Age'] == 998).sum()
above_100

df.drop(df[df['Victim Age'] > 100].index, inplace=True)