In this Notebook:
1. Preparation
2. Data Cleaning

# 1. Preparation

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Note: during EDA, countries with higher patent output showed stronger relationships with the
# selected world development indicator variables than countries with lower output, so the hypothesis was revised.
# Revised hypothesis: countries with higher patent output have higher GDP, spend more on R&D,
# and have lower rates of teenage pregnancy.

In [3]:
# Define the project root folder.
# The location can be overridden with the PROJECT_6_PATH environment variable so the
# notebook runs on other machines; the original local path remains the default.
path = os.environ.get('PROJECT_6_PATH', r"/Users/katerinapilota/Desktop/Project_6")

In [4]:
# ensure charts are automatically displayed inline in the notebook
%matplotlib inline

In [5]:
# Load the prepared patent dataset from <path>/Data/Prepared Data/
data_file = os.path.join(path, 'Data', 'Prepared Data', 'patent_categories.csv')
df = pd.read_csv(data_file)

In [6]:
# preview the first rows of the loaded data
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Office,Office (Code),Applicant Origin - Country,"1 - Electrical machinery, apparatus, energy",2 - Audio-visual technology,3 - Telecommunications,4 - Digital communication,...,"33 - Furniture, games",34 - Other consumer goods,35 - Civil engineering,Year,Applicant Origin - Region,Total patent numbers,GDP PPP (per capita),R&D (% GDP),Ferrtility Rate (15-19 yo),Patent count category
0,0,0,0,United States of America,US,Albania,0,0,0,0,...,0,0,0,1999,Southern Europe,1,3471.656359,,19.1348,Low
1,1,1,1,France,FR,Andorra,0,1,0,0,...,0,0,0,1999,Southern Europe,1,0.0,,0.0,Low
2,2,2,2,European Patent Office,EP,Andorra,0,0,0,0,...,0,0,0,1999,Southern Europe,1,0.0,,0.0,Low
3,3,3,3,Spain,ES,Andorra,0,0,0,0,...,0,0,0,1999,Southern Europe,2,0.0,,0.0,Middle
4,4,4,4,France,FR,Andorra,0,0,0,0,...,0,0,0,1999,Southern Europe,1,0.0,,0.0,Low


In [7]:
# (rows, columns) of the loaded data
df.shape

(39335, 48)

# 2. Data Cleaning

In [8]:
# Drop the unnecessary index columns.
# Reassigning instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], errors='ignore')

In [9]:
# preview the data again to confirm the extra index columns are gone
df.head()

Unnamed: 0,Office,Office (Code),Applicant Origin - Country,"1 - Electrical machinery, apparatus, energy",2 - Audio-visual technology,3 - Telecommunications,4 - Digital communication,5 - Basic communication processes,6 - Computer technology,7 - IT methods for management,...,"33 - Furniture, games",34 - Other consumer goods,35 - Civil engineering,Year,Applicant Origin - Region,Total patent numbers,GDP PPP (per capita),R&D (% GDP),Ferrtility Rate (15-19 yo),Patent count category
0,United States of America,US,Albania,0,0,0,0,0,0,0,...,0,0,0,1999,Southern Europe,1,3471.656359,,19.1348,Low
1,France,FR,Andorra,0,1,0,0,0,0,0,...,0,0,0,1999,Southern Europe,1,0.0,,0.0,Low
2,European Patent Office,EP,Andorra,0,0,0,0,0,0,0,...,0,0,0,1999,Southern Europe,1,0.0,,0.0,Low
3,Spain,ES,Andorra,0,0,0,0,0,0,0,...,0,0,0,1999,Southern Europe,2,0.0,,0.0,Middle
4,France,FR,Andorra,0,0,0,0,0,0,0,...,0,0,0,1999,Southern Europe,1,0.0,,0.0,Low


In [10]:
# count the missing values in each column
df.isnull().sum(axis=0)

Office                                             0
Office (Code)                                      0
Applicant Origin - Country                         0
1 - Electrical machinery, apparatus, energy        0
2 - Audio-visual technology                        0
3 - Telecommunications                             0
4 - Digital communication                          0
5 - Basic communication processes                  0
6 - Computer technology                            0
7 - IT methods for management                      0
8 - Semiconductors                                 0
9 - Optics                                         0
10 - Measurement                                   0
11 - Analysis of biological materials              0
12 - Control                                       0
13 - Medical technology                            0
14 - Organic fine chemistry                        0
15 - Biotechnology                                 0
16 - Pharmaceuticals                          

In [11]:
# There are many missing values - possibly entire countries that would be important for the analysis.
# Collect every row that has at least one NaN to see which records are affected.
# .copy() makes Nans an independent DataFrame, so later in-place edits to it do not
# operate on a selection of df (avoids SettingWithCopyWarning).
Nans = df.loc[df.isnull().any(axis=1)].copy()

In [12]:
# (rows, columns) of the subset of rows containing at least one NaN
Nans.shape

(13044, 45)

In [13]:
# show every row when displaying results (disable row truncation)
pd.options.display.max_rows = None

In [14]:
# number of NaN-affected rows per applicant origin country
Nans.loc[:, 'Applicant Origin - Country'].value_counts()

United States of America                 2844
Switzerland                              1616
Republic of Korea                        1337
Unknown                                   897
Australia                                 841
Russian Federation                        536
Liechtenstein                             465
South Africa                              399
Brazil                                    349
China, Hong Kong SAR                      317
Ukraine                                   194
Luxembourg                                193
New Zealand                               181
India                                     179
Croatia                                   123
Slovakia                                  115
Monaco                                    110
Republic of Moldova                       107
Netherlands Antilles                      104
Morocco                                    88
Barbados                                   86
Belarus                           

In [60]:
# Remove rows whose applicant origin country is 'Unknown' from both DataFrames.
# A boolean filter with reassignment replaces drop(..., inplace=True): it does not
# mutate a frame derived from another one (Nans is selected from df), does not rely
# on index labels, and gives the same result when the cell is re-run.
df = df[df['Applicant Origin - Country'] != 'Unknown'].copy()
Nans = Nans[Nans['Applicant Origin - Country'] != 'Unknown'].copy()

In [59]:
# NaN-affected rows per applicant origin country, after removing 'Unknown'
Nans.loc[:, 'Applicant Origin - Country'].value_counts()

United States of America                 2844
Switzerland                              1616
Republic of Korea                        1337
Australia                                 841
Russian Federation                        536
Liechtenstein                             465
South Africa                              399
Brazil                                    349
China, Hong Kong SAR                      317
Ukraine                                   194
Luxembourg                                193
New Zealand                               181
India                                     179
Croatia                                   123
Slovakia                                  115
Monaco                                    110
Republic of Moldova                       107
Netherlands Antilles                      104
Morocco                                    88
Belarus                                    86
Barbados                                   86
United Kingdom                    

In [63]:
# Compare a country's full set of records with its NaN-affected records, e.g. the United States
is_usa = df['Applicant Origin - Country'] == 'United States of America'
USA = df[is_usa]

In [64]:
# display the United States records
# NOTE(review): display.max_rows is set to None earlier in this notebook, so this renders
# every row of the subset; consider USA.shape or USA.head() if only a summary is needed.
USA

Unnamed: 0,Office,Office (Code),Applicant Origin - Country,"1 - Electrical machinery, apparatus, energy",2 - Audio-visual technology,3 - Telecommunications,4 - Digital communication,5 - Basic communication processes,6 - Computer technology,7 - IT methods for management,...,"33 - Furniture, games",34 - Other consumer goods,35 - Civil engineering,Year,Applicant Origin - Region,Total patent numbers,GDP PPP (per capita),R&D (% GDP),Ferrtility Rate (15-19 yo),Patent count category
35491,African Regional Intellectual Property Organiz...,AP,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,1,,,,Low
35492,Australia,AU,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,230,,,,High
35493,Austria,AT,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,1,,,,Low
35494,Bulgaria,BG,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,5,,,,Low
35495,Canada,CA,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,292,,,,High
35496,China,CN,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,56,,,,High
35497,Croatia,HR,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,2,,,,Middle
35498,Czechoslovakia,CS,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,1,,,,Low
35499,Czech Republic,CZ,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,5,,,,Low
35500,Denmark,DK,United States of America,0,0,0,0,0,0,0,...,0,0,0,1999,Northern America,3,,,,Middle


In [65]:
# select all records whose applicant origin country is Australia
is_australia = df['Applicant Origin - Country'] == 'Australia'
AUS = df[is_australia]

In [66]:
# display the Australia records
# NOTE(review): display.max_rows is set to None earlier in this notebook, so this renders
# every row of the subset; consider AUS.shape or AUS.head() if only a summary is needed.
AUS

Unnamed: 0,Office,Office (Code),Applicant Origin - Country,"1 - Electrical machinery, apparatus, energy",2 - Audio-visual technology,3 - Telecommunications,4 - Digital communication,5 - Basic communication processes,6 - Computer technology,7 - IT methods for management,...,"33 - Furniture, games",34 - Other consumer goods,35 - Civil engineering,Year,Applicant Origin - Region,Total patent numbers,GDP PPP (per capita),R&D (% GDP),Ferrtility Rate (15-19 yo),Patent count category
116,African Regional Intellectual Property Organiz...,AP,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,1,25264.36436,,18.4394,Low
117,Australia,AU,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,38,25264.36436,,18.4394,High
118,Canada,CA,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,5,25264.36436,,18.4394,Low
119,China,CN,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,4,25264.36436,,18.4394,Middle
120,Croatia,HR,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,1,25264.36436,,18.4394,Low
121,Denmark,DK,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,1,25264.36436,,18.4394,Low
122,Egypt,EG,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,2,25264.36436,,18.4394,Middle
123,European Patent Office,EP,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,7,25264.36436,,18.4394,Low
124,Finland,FI,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,1,25264.36436,,18.4394,Low
125,Greece,GR,Australia,0,0,0,0,0,0,0,...,0,0,0,1999,Australia and New Zealand,1,25264.36436,,18.4394,Low
