In [1]:
import pandas as pd
import numpy as np

## read the JSON file that you saved in ex02

In [2]:
df = df = pd.read_json('../data/auto.json', orient='records')

In [3]:
pd.options.display.float_format = '{:.2f}'.format

In [4]:
df.shape

(725, 5)

## enrich the dataframe using a sample from that dataframe

In [5]:
number_examples = 200

In [6]:
# create a sample with 200 new observations
## The sample should not have new combinations of the car number, make and model –
## so the whole dataset will be consistent in these terms
new_df = df.iloc[np.random.randint(0, df.shape[0], number_examples)]

In [7]:
new_df.loc[:,['Refund']] = np.random.randint(1, 3, number_examples)
new_df.loc[:,['Fines']] = df['Fines'].max() * np.random.rand(number_examples)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [8]:
new_df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
635,T6049O50RUS,2,7167.19,Ford,Focus
296,O02397197RUS,2,46037.63,Volkswagen,Jetta
478,8201XX154RUS,2,55909.13,Ford,Focus
517,9409H7178RUS,2,87549.2,Skoda,Octavia
584,T396KX197RUS,1,105615.91,Ford,Focus


In [9]:
# concatenate the sample with the initial dataframe to a new dataframe concat_rows
concat_rows = pd.concat([df, new_df], ignore_index=True)

In [10]:
concat_rows.shape

(925, 5)

## enrich the dataframe concat_rows by a new column with generated data

In [11]:
# create a series with the name year using random integers from 1980 to 2019
years = pd.Series(np.random.randint(1980, 2020, concat_rows.shape[0]), name='year')

In [12]:
fines = concat_rows.merge(years, left_index=True, right_index=True)

In [13]:
fines.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,year
0,Y163O8161RUS,2,3200.0,Ford,Focus,2007
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987
2,7184TT36RUS,1,2100.0,Ford,Focus,1991
3,X582HE161RUS,2,2000.0,Ford,Focus,2012
4,92918M178RUS,1,5700.0,Ford,Focus,2014


## enrich the dataframe by the data from another dataframe

In [14]:
import requests

In [15]:
unique_people = df['CarNumber'].nunique()
unique_people

531

In [20]:
response = requests.get(f'https://api.census.gov/data/2010/surname?get=NAME,COUNT&RANK=1:{unique_people}')

In [21]:
names = pd.DataFrame(response.json()).drop(0).reset_index()[0]

In [22]:
owners = pd.concat([names, pd.Series(concat_rows['CarNumber'].unique())],
          axis=1,
          ignore_index=True).rename(columns={0: 'NAME', 1: 'CarNumber'})

In [23]:
def generate_number(n):
    letters = 'ABCEXKH'
    end = 'RUS'
    res = list()
    for _ in range(n):
        rand = np.random.randint(0, 1000, 2)
        rand_letters = np.random.choice(list(letters), 3)
        num = f'{rand_letters[0]}{rand[0]:03}{rand_letters[1]}{rand_letters[2]}{rand[1]:03}{end}'
#         print(num)
        res.append(num)
    return res

In [24]:
# append to the fines dataframe 5 more observations
# (come up with your own ideas of CarNumber, etc.)
number_examples = 5
tmp = fines.iloc[np.random.randint(0, fines.shape[0] + 1, number_examples)]
tmp.loc[:,['CarNumber']] = generate_number(number_examples)
tmp.loc[:,['Refund']] = np.random.randint(1, 3, number_examples)
tmp.loc[:,['Fines']] = df['Fines'].max() * np.random.rand(number_examples)
tmp.loc[:,['year']] = np.random.randint(1980, 2020, number_examples)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [25]:
fines = pd.concat([fines, tmp], ignore_index=True)

In [26]:
owners.shape

(531, 2)

In [27]:
# delete from the owners dataframe 20 observations and add 5 new observations
# (they are not the same as those you add to the fines dataframe)
owners = owners.drop(np.random.randint(0, owners.shape[0], 20))
tmp = owners.iloc[np.random.randint(0, owners.shape[0] + 1, number_examples)]
tmp.loc[:,['CarNumber']] = np.array(generate_number(number_examples)).reshape(-1, 1)
owners = pd.concat([owners, tmp], ignore_index=True)
owners

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,NAME,CarNumber
0,ACOSTA,Y163O8161RUS
1,ADAMS,E432XX77RUS
2,ADKINS,7184TT36RUS
3,AGUILAR,X582HE161RUS
4,AGUIRRE,92918M178RUS
...,...,...
512,FOSTER,H034CK979RUS
513,LITTLE,C113CA579RUS
514,WALLACE,B380XK806RUS
515,WELCH,H561AX850RUS


In [28]:
# join both dataframes
# the new dataframe should have only the car numbers that exist in both dataframes
fines.merge(owners, how='inner')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,year,NAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,2007,ACOSTA
1,Y163O8161RUS,2,1600.00,Ford,Focus,2012,ACOSTA
2,Y163O8161RUS,1,173399.52,Ford,Focus,2017,ACOSTA
3,E432XX77RUS,1,6500.00,Toyota,Camry,1987,ADAMS
4,E432XX77RUS,2,13000.00,Toyota,Camry,1982,ADAMS
...,...,...,...,...,...,...,...
887,O136HO197RUS,2,7800.00,Toyota,Corolla,1999,WU
888,O22097197RUS,1,24300.00,Ford,Focus,1986,YANG
889,M0309X197RUS,1,22300.00,Ford,Focus,1983,YOUNG
890,O673E8197RUS,2,600.00,Ford,Focus,2003,ZHANG


In [29]:
# the new dataframe should have all the car numbers that exist in both dataframes
fines.merge(owners, how='outer')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,year,NAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2007.00,ACOSTA
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,2012.00,ACOSTA
2,Y163O8161RUS,1.00,173399.52,Ford,Focus,2017.00,ACOSTA
3,E432XX77RUS,1.00,6500.00,Toyota,Camry,1987.00,ADAMS
4,E432XX77RUS,2.00,13000.00,Toyota,Camry,1982.00,ADAMS
...,...,...,...,...,...,...,...
930,H034CK979RUS,,,,,,FOSTER
931,C113CA579RUS,,,,,,LITTLE
932,B380XK806RUS,,,,,,WALLACE
933,H561AX850RUS,,,,,,WELCH


In [30]:
# the new dataframe should have only the car numbers from the fines dataframe
fines.merge(owners, how='left')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,year,NAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,2007,ACOSTA
1,E432XX77RUS,1,6500.00,Toyota,Camry,1987,ADAMS
2,7184TT36RUS,1,2100.00,Ford,Focus,1991,ADKINS
3,X582HE161RUS,2,2000.00,Ford,Focus,2012,AGUILAR
4,92918M178RUS,1,5700.00,Ford,Focus,2014,AGUIRRE
...,...,...,...,...,...,...,...
925,C191CA760RUS,1,179284.81,Toyota,Camry,2017,
926,X314XC602RUS,2,76388.93,Ford,Focus,1997,
927,B315AH843RUS,1,52883.55,Ford,Focus,1990,
928,K191BK150RUS,1,65569.29,Ford,Focus,2015,


In [31]:
# the new dataframe should have only the car numbers from the owners dataframe
fines.merge(owners, how='right')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,year,NAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2007.00,ACOSTA
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,2012.00,ACOSTA
2,Y163O8161RUS,1.00,173399.52,Ford,Focus,2017.00,ACOSTA
3,E432XX77RUS,1.00,6500.00,Toyota,Camry,1987.00,ADAMS
4,E432XX77RUS,2.00,13000.00,Toyota,Camry,1982.00,ADAMS
...,...,...,...,...,...,...,...
892,H034CK979RUS,,,,,,FOSTER
893,C113CA579RUS,,,,,,LITTLE
894,B380XK806RUS,,,,,,WALLACE
895,H561AX850RUS,,,,,,WELCH


## create a pivot table from the fines dataframe, it should look like this (the values are the sums of the fines), but only with all the years:

In [32]:
pd.pivot_table(fines, index=['Make', 'Model'], columns='year', values='Fines', aggfunc=np.sum)

Unnamed: 0_level_0,year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,1008414.12,214156.44,480398.39,574153.21,415811.88,277756.8,308496.73,185236.1,331758.16,404385.13,...,745103.11,250285.51,1210642.68,150799.72,519384.56,675076.35,577332.77,541043.12,241506.97,543084.03
Ford,Mondeo,8600.0,,,,,,,,,,...,,,6700.0,,,34400.0,,63021.45,,
Skoda,Octavia,110925.19,,8200.0,125224.38,500.0,,166500.0,,3000.0,6800.0,...,3500.0,102183.08,53048.01,5100.0,30605.44,300.0,,17189.17,79837.14,2400.0
Toyota,Camry,4400.0,1000.0,13000.0,800.0,,151130.1,19800.0,91562.6,,300885.83,...,1000.0,,,,,,,191297.06,177714.6,
Toyota,Corolla,12700.0,144817.37,,139591.29,,,,6800.0,,,...,,89404.46,,,,,8000.0,,,900.0
Volkswagen,Golf,,3800.0,5800.0,87510.27,,176594.59,,,1300.0,,...,,24000.0,17513.25,,127504.95,18400.0,,500.0,,
Volkswagen,Jetta,46037.63,29198.76,,137938.66,,,,,,,...,1600.0,,136116.91,,,,9000.0,,,
Volkswagen,Passat,3000.0,18200.0,3800.0,12400.0,,,,10000.0,900.0,,...,,136040.82,12800.0,,3000.0,,1600.0,100.0,2000.0,
Volkswagen,Touareg,,5800.0,,,,,,,,,...,,,,,,,,,,


In [41]:
fines.to_csv('../data/fines.csv', index=False)
owners.to_csv('../data/owners.csv', index=False)

# Checklist

In [34]:
concat_rows.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
dtype: int64

In [35]:
fines.count()

CarNumber    930
Refund       930
Fines        930
Make         930
Model        918
year         930
dtype: int64

In [36]:
len(owners)

517

In [37]:
len(fines)

930