In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("dataset.csv", low_memory=False)

# The data type of perpetrator's age was not seen as int initially, so there was the need to treat those values and 
# transform them to NaN (there is only one row with an NA value in 'Perpetrator Age')
df['Perpetrator Age'] = pd.to_numeric(df['Perpetrator Age'], errors='coerce').astype('Int64')
# Because there was only one row of more than 600k rows that missed the Perpetrator Age value, 
# I've decided to simply delete this line. 
df.dropna(inplace=True)

In [3]:
df.dtypes

Record ID            int64
Agency Code         object
Agency Name         object
Agency Type         object
City                object
State               object
Year                 int64
Month               object
Incident             int64
Crime Type          object
Crime Solved        object
Victim Sex          object
Victim Age           int64
Victim Race         object
Perpetrator Sex     object
Perpetrator Age      Int64
Perpetrator Race    object
Relationship        object
Weapon              object
Record Source       object
dtype: object

In [4]:
#number of lines with NA values count
nan_counts = df.isna().sum()
nan_counts

Record ID           0
Agency Code         0
Agency Name         0
Agency Type         0
City                0
State               0
Year                0
Month               0
Incident            0
Crime Type          0
Crime Solved        0
Victim Sex          0
Victim Age          0
Victim Race         0
Perpetrator Sex     0
Perpetrator Age     0
Perpetrator Race    0
Relationship        0
Weapon              0
Record Source       0
dtype: int64

In [5]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Relationship,Weapon,Record Source
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,Yes,Male,14,Native American/Alaska Native,Male,15,Native American/Alaska Native,Acquaintance,Blunt Object,FBI
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,Yes,Male,43,White,Male,42,White,Acquaintance,Strangulation,FBI
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,Yes,Male,43,White,Male,42,White,Acquaintance,Strangulation,FBI
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI


In [6]:
## While doing the min and maximum age I've noticed that some Victim Ages from unknown cases were 
## described as 998, so I am leaving this note for the future so i can treat and drop these lines, 
## or do something else with them, like substituting for the mean/median
above_100 = (df['Victim Age'] == 998).sum()
above_100

df.drop(df[df['Victim Age'] > 100].index, inplace=True)

## Tasks

a) Identify which variables are categorical, discrete and continuous in the chosen data set and show
using some visualization or plot. Explore whether there are missing values for any of the variables.

b) Calculate the statistical parameters (mean, median, minimum, maximum, and standard deviation)
for each of the numerical variables.

c) Apply Min-Max Normalization, Z-score Standardization and Robust scalar on the numerical data
variables.

d) Line, Scatter and Heatmaps can be used to show the correlation between the features of the
dataset.

e) Graphics and descriptive understanding should be provided along with Data Exploratory analysis
(EDA). Identify subgroups of features that can explore some interesting facts.

f) Apply dummy encoding to categorical variables (at least one variable used from the data set) and
discuss the benefits of dummy encoding to understand the categorical data.

g) Apply PCA with your chosen number of components. Write up a short profile of the first few
components extracted based on your understanding.

h) What is the purpose of dimensionality reduction? Explore the situations where you can gain the
benefit of dimensionality reduction for data analysis.

-----------------

a)

b) Numerical Values: Year, Victim Age, Perpetrator Age

## Mean Calculation

In [7]:
year_mean = df['Year'].mean()
year_mean = round(year_mean, 1)
victim_age_mean = df['Victim Age'].mean()
victim_age_mean = round(victim_age_mean, 2)
perpetrator_age_mean = df['Perpetrator Age'].mean()
perpetrator_age_mean = round(perpetrator_age_mean, 2)

## Mean Values

In [8]:
print("The mean for the Year values is: ", year_mean)
print("The mean for the Victim Age values is: ", victim_age_mean)
print("The mean for the Year values is: ", perpetrator_age_mean)

The mean for the Year values is:  1995.8
The mean for the Victim Age values is:  33.56
The mean for the Year values is:  20.34


## Median calculation

In [9]:
year_median = df['Year'].median()
victim_age_median = df['Victim Age'].median()
perpetrator_age_median = df['Perpetrator Age'].median()

## Median Values

In [10]:
print("The median for the Year values is: ", year_median)
print("The median for the Victim Age values is: ", victim_age_median)
print("The median for the Year values is: ", perpetrator_age_median)

The median for the Year values is:  1995.0
The median for the Victim Age values is:  30.0
The median for the Year values is:  21.0


## Minimum and Maximum Values Calculation

In [11]:
minimum_year = df['Year'].min()
maximum_year = df['Year'].max()

minimum_victim_age = df['Victim Age'].min()
maximum_victim_age = df['Victim Age'].max()

minimum_perpetrator_age = df['Perpetrator Age'].min()
maximum_perpetrator_age = df['Perpetrator Age'].max()

## Minimum and Maximum Values

In [12]:
print("This dataset starts in ", minimum_year, "and finishes at ", maximum_year)
print("The youngest victims were ", minimum_victim_age, " years old, and the oldest", maximum_victim_age)
print("The youngest perpetrators were", minimum_perpetrator_age, "and the oldest were", maximum_perpetrator_age)

This dataset starts in  1980 and finishes at  2014
The youngest victims were  0  years old, and the oldest 99
The youngest perpetrators were 0 and the oldest were 99


## Standard Deviation


In [13]:
year_std = df['Year'].std()
victim_age_std = df['Victim Age'].std()
perpetrator_age_std = df['Perpetrator Age'].std()

In [14]:
print("The Year's Standard Deviation value is: ", year_std)
print("The Victim Age's Standard Deviation is: ", victim_age_std)
print("The Perpetrator Age's Standard Deviation is: ", perpetrator_age_std)

The Year's Standard Deviation value is:  9.925971892548677
The Victim Age's Standard Deviation is:  17.79260103274315
The Perpetrator Age's Standard Deviation is:  17.885289040116565


## Min-Max Normalization

For this calculation we have the following formula:

X* = X - min(X)/ max(X) - min(X)


In [15]:
df['Normalized Year'] = (df['Year'] - minimum_year) / (maximum_year - minimum_year)
df['Normalized Victim Age'] = (df['Victim Age'] - minimum_victim_age) / (maximum_victim_age - minimum_victim_age)
df['Normalized Perpetrator Age'] = (df['Perpetrator Age'] - minimum_perpetrator_age) / (maximum_perpetrator_age - minimum_perpetrator_age)

In [17]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Race,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Relationship,Weapon,Record Source,Normalized Year,Normalized Victim Age,Normalized Perpetrator Age
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Native American/Alaska Native,Male,15,Native American/Alaska Native,Acquaintance,Blunt Object,FBI,0.0,0.141414,0.151515
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,White,Male,42,White,Acquaintance,Strangulation,FBI,0.0,0.434343,0.424242
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI,0.0,0.30303,0.0
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,White,Male,42,White,Acquaintance,Strangulation,FBI,0.0,0.434343,0.424242
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Native American/Alaska Native,Unknown,0,Unknown,Unknown,Unknown,FBI,0.0,0.30303,0.0


## Z-Score

For this calculation we use the formula: X* = X - Mean(X) / Standard_Deviation(X)

In [18]:
df['Z-Score Year'] = (df['Year'] - year_mean / year_std)
df['Z-Score Victim Age'] = (df['Victim Age'] - victim_age_mean / victim_age_std)
df['Z-Score Perpetrator Age'] = (df['Perpetrator Age'] - perpetrator_age_mean / perpetrator_age_std)

In [20]:
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Perpetrator Race,Relationship,Weapon,Record Source,Normalized Year,Normalized Victim Age,Normalized Perpetrator Age,Z-Score Year,Z-Score Victim Age,Z-Score Perpetrator Age
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Native American/Alaska Native,Acquaintance,Blunt Object,FBI,0.0,0.141414,0.151515,1778.931528,12.113823,13.862753
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,White,Acquaintance,Strangulation,FBI,0.0,0.434343,0.424242,1778.931528,41.113823,40.862753
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Unknown,Unknown,Unknown,FBI,0.0,0.30303,0.0,1778.931528,28.113823,-1.137247
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,White,Acquaintance,Strangulation,FBI,0.0,0.434343,0.424242,1778.931528,41.113823,40.862753
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Unknown,Unknown,Unknown,FBI,0.0,0.30303,0.0,1778.931528,28.113823,-1.137247
