In [5]:
import pandas as pd

# Load the raw export and discard rows that are exact duplicates.
df = pd.read_csv("./data_cts_violent_crime.csv").drop_duplicates()

# Descriptive columns that are not carried into the cleaned table.
unused_columns = [
    'Iso3_code', 'Region', 'Subregion', 'Dimension',
    'Source', 'Indicator', 'Category',
]
df = df.drop(columns=unused_columns)

# Remove the per-capita rows; rows with any other unit of measurement stay.
drop_pop = df.loc[df['Unit of measurement'] == 'Rate per 100,000 population'].index
df = df.drop(index=drop_pop)

In [6]:
# Per-column flag: True when the column holds at least one missing value.
print(df.isna().any())

Country                False
Sex                    False
Age                    False
Year                   False
Unit of measurement    False
VALUE                  False
dtype: bool


In [7]:
# One-hot encode 'Sex' into Gender_<category> indicator columns, one per
# category present in the data. Requesting the integer dtype from get_dummies
# itself avoids a follow-up cast that hard-codes the expected column names
# and raises KeyError when one of those categories is absent.
df = pd.get_dummies(df, columns=['Sex'], prefix='Gender', dtype=int)
df

Unnamed: 0,Country,Age,Year,Unit of measurement,VALUE,Gender_Female,Gender_Male,Gender_Total
0,Azerbaijan,Total,2003,Counts,155.0,0,0,1
1,Belgium,Total,2003,Counts,61959.0,0,0,1
2,Bulgaria,Total,2003,Counts,3806.0,0,0,1
3,Bahrain,Total,2003,Counts,2701.0,0,0,1
4,Belarus,Total,2003,Counts,4032.0,0,0,1
...,...,...,...,...,...,...,...,...
13068,Montenegro,Total,2021,Counts,10.0,0,0,1
13069,Mauritius,Total,2021,Counts,342.0,0,0,1
13070,El Salvador,Total,2021,Counts,4.0,0,0,1
13071,Serbia,Total,2021,Counts,1.0,0,0,1


In [8]:
# VALUE should not hold fractions: cast it to integers, then drop the
# 'Unit of measurement' column.
df = (
    df
    .assign(VALUE=df['VALUE'].astype(int))
    .drop(columns=['Unit of measurement'])
)
df

Unnamed: 0,Country,Age,Year,VALUE,Gender_Female,Gender_Male,Gender_Total
0,Azerbaijan,Total,2003,155,0,0,1
1,Belgium,Total,2003,61959,0,0,1
2,Bulgaria,Total,2003,3806,0,0,1
3,Bahrain,Total,2003,2701,0,0,1
4,Belarus,Total,2003,4032,0,0,1
...,...,...,...,...,...,...,...
13068,Montenegro,Total,2021,10,0,0,1
13069,Mauritius,Total,2021,342,0,0,1
13070,El Salvador,Total,2021,4,0,0,1
13071,Serbia,Total,2021,1,0,0,1


In [9]:
# One-hot encode 'Age' into Age_<category> columns and list the resulting
# column names.
df = pd.get_dummies(data=df, prefix='Age', columns=['Age'])
print(df.columns)

Index(['Country', 'Year', 'VALUE', 'Gender_Female', 'Gender_Male',
       'Gender_Total', 'Age_Total'],
      dtype='object')


In [10]:
# Append all-zero Age_Under18 / Age_Over18 columns and cast the Age_Total
# indicator to integers.
df = (
    df
    .assign(Age_Under18=0, Age_Over18=0)
    .astype({'Age_Total': 'int'})
)
df

Unnamed: 0,Country,Year,VALUE,Gender_Female,Gender_Male,Gender_Total,Age_Total,Age_Under18,Age_Over18
0,Azerbaijan,2003,155,0,0,1,1,0,0
1,Belgium,2003,61959,0,0,1,1,0,0
2,Bulgaria,2003,3806,0,0,1,1,0,0
3,Bahrain,2003,2701,0,0,1,1,0,0
4,Belarus,2003,4032,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...
13068,Montenegro,2021,10,0,0,1,1,0,0
13069,Mauritius,2021,342,0,0,1,1,0,0
13070,El Salvador,2021,4,0,0,1,1,0,0
13071,Serbia,2021,1,0,0,1,1,0,0


In [11]:
# Parse the four-digit year into a datetime column, then show every
# column's dtype.
df = df.assign(Year=pd.to_datetime(df['Year'], format='%Y'))
print(df.dtypes)

Country                  object
Year             datetime64[ns]
VALUE                     int32
Gender_Female             int32
Gender_Male               int32
Gender_Total              int32
Age_Total                 int32
Age_Under18               int64
Age_Over18                int64
dtype: object


In [12]:
# Trim leading/trailing whitespace in every object-dtype column; columns of
# any other dtype pass through unchanged.
text_columns = df.select_dtypes(include='object').columns
df = df.assign(**{name: df[name].str.strip() for name in text_columns})

In [13]:
# Distinct country names in order of first appearance, followed by how many
# there are.
unique_countries = df['Country'].unique()
n = len(unique_countries)
print(unique_countries, n, sep='\n')

['Azerbaijan' 'Belgium' 'Bulgaria' 'Bahrain' 'Belarus' 'Bermuda'
 'Brunei Darussalam' 'Canada' 'Switzerland' 'Cameroon' 'Cyprus' 'Germany'
 'Denmark' 'Algeria' 'Ecuador' 'Finland' 'France'
 'United Kingdom (England and Wales)' 'United Kingdom (Northern Ireland)'
 'United Kingdom (Scotland)' 'Greece'
 'China, Hong Kong Special Administrative Region' 'Croatia' 'Hungary'
 'Ireland' 'Japan' 'Kyrgyzstan' 'Liechtenstein' 'Sri Lanka' 'Lithuania'
 'Latvia' 'Morocco' 'Monaco' 'Montenegro' 'Mongolia' 'Mauritius'
 'Netherlands (Kingdom of the)' 'Norway' 'Portugal' 'State of Palestine'
 'Qatar' 'Romania' 'Singapore' 'Sao Tome and Principe' 'Slovakia'
 'Slovenia' 'Sweden' 'Eswatini' 'Syrian Arab Republic' 'Türkiye' 'Uganda'
 'Ukraine' 'United States of America' 'Zimbabwe' 'Armenia' 'Austria'
 'Barbados' 'Colombia' 'Czechia' 'Georgia' 'Guatemala' 'India' 'Israel'
 'Italy' 'Kuwait' 'Mexico' 'Mozambique' 'Peru' 'Russian Federation'
 'Solomon Islands' 'Trinidad and Tobago'
 'Saint Vincent and the Grena

In [14]:
# Map each country with fewer than 100 rows to its row count.
rows_per_country = df.groupby('Country').size()
lack_data = {
    country: int(row_count)
    for country, row_count in rows_per_country.items()
    if row_count < 100
}
lack_data

{'Algeria': 83,
 'Andorra': 69,
 'Antigua and Barbuda': 73,
 'Argentina': 52,
 'Armenia': 82,
 'Australia': 68,
 'Bahrain': 19,
 'Bangladesh': 6,
 'Belarus': 82,
 'Belize': 98,
 'Benin': 4,
 'Bermuda': 48,
 'Bhutan': 60,
 'Bosnia and Herzegovina': 87,
 'Botswana': 17,
 'Brazil': 60,
 'Brunei Darussalam': 12,
 'Bulgaria': 89,
 'Burundi': 26,
 'Cabo Verde': 41,
 'Cameroon': 55,
 'China, Hong Kong Special Administrative Region': 67,
 'Côte d’Ivoire': 4,
 'Denmark': 99,
 'Dominica': 71,
 'Ecuador': 86,
 'Egypt': 30,
 'Estonia': 85,
 'Eswatini': 12,
 'Georgia': 39,
 'Ghana': 25,
 'Guinea': 10,
 'Guinea-Bissau': 12,
 'Haiti': 3,
 'Holy See': 75,
 'Iceland': 81,
 'India': 40,
 'Indonesia': 54,
 'Iraq (Central Iraq)': 34,
 'Israel': 66,
 'Jamaica': 82,
 'Japan': 72,
 'Jordan': 32,
 'Kazakhstan': 47,
 'Kenya': 89,
 'Kosovo under UNSCR 1244': 86,
 'Kuwait': 24,
 'Kyrgyzstan': 75,
 'Lebanon': 56,
 'Lesotho': 9,
 'Liechtenstein': 77,
 'Luxembourg': 79,
 'Madagascar': 30,
 'Malaysia': 17,
 'Maldive

In [15]:
# Keep only the rows whose country is not among the under-represented ones.
countries_to_drop = [*lack_data]
df = df.loc[~df['Country'].isin(countries_to_drop)]
df

Unnamed: 0,Country,Year,VALUE,Gender_Female,Gender_Male,Gender_Total,Age_Total,Age_Under18,Age_Over18
0,Azerbaijan,2003-01-01,155,0,0,1,1,0,0
1,Belgium,2003-01-01,61959,0,0,1,1,0,0
7,Canada,2003-01-01,48135,0,0,1,1,0,0
8,Switzerland,2003-01-01,6732,0,0,1,1,0,0
10,Cyprus,2003-01-01,135,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...
13064,Croatia,2021-01-01,75,0,0,1,1,0,0
13066,"China, Macao Special Administrative Region",2021-01-01,11,0,0,1,1,0,0
13067,Morocco,2021-01-01,167,0,0,1,1,0,0
13070,El Salvador,2021-01-01,4,0,0,1,1,0,0


In [16]:
# Reorder columns so that 'VALUE' comes last; the other columns keep their
# relative order.
cols = df.columns.drop('VALUE').tolist() + ['VALUE']
df = df.loc[:, cols]

df

Unnamed: 0,Country,Year,Gender_Female,Gender_Male,Gender_Total,Age_Total,Age_Under18,Age_Over18,VALUE
0,Azerbaijan,2003-01-01,0,0,1,1,0,0,155
1,Belgium,2003-01-01,0,0,1,1,0,0,61959
7,Canada,2003-01-01,0,0,1,1,0,0,48135
8,Switzerland,2003-01-01,0,0,1,1,0,0,6732
10,Cyprus,2003-01-01,0,0,1,1,0,0,135
...,...,...,...,...,...,...,...,...,...
13064,Croatia,2021-01-01,0,0,1,1,0,0,75
13066,"China, Macao Special Administrative Region",2021-01-01,0,0,1,1,0,0,11
13067,Morocco,2021-01-01,0,0,1,1,0,0,167
13070,El Salvador,2021-01-01,0,0,1,1,0,0,4


In [17]:
# Reposition 'Age_Total' so it sits immediately before the final column.
columns = list(df.columns)
columns.insert(-1, columns.pop(columns.index('Age_Total')))
df = df.loc[:, columns]
df

Unnamed: 0,Country,Year,Gender_Female,Gender_Male,Gender_Total,Age_Under18,Age_Over18,Age_Total,VALUE
0,Azerbaijan,2003-01-01,0,0,1,0,0,1,155
1,Belgium,2003-01-01,0,0,1,0,0,1,61959
7,Canada,2003-01-01,0,0,1,0,0,1,48135
8,Switzerland,2003-01-01,0,0,1,0,0,1,6732
10,Cyprus,2003-01-01,0,0,1,0,0,1,135
...,...,...,...,...,...,...,...,...,...
13064,Croatia,2021-01-01,0,0,1,0,0,1,75
13066,"China, Macao Special Administrative Region",2021-01-01,0,0,1,0,0,1,11
13067,Morocco,2021-01-01,0,0,1,0,0,1,167
13070,El Salvador,2021-01-01,0,0,1,0,0,1,4


In [73]:
# Write the cleaned table to disk, leaving the row index out of the file.
df.to_csv(path_or_buf='D1_data.csv', index=False)