# Task 1: Data Preparation

### Imports & format checks

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import nessasary libraries

In [None]:
# Load the data from the file NBA_players_stats.csv using pandas library.
NBA = pd.read_csv('NBA_players_stats.csv', index_col=0) #prevent index number from affecting result

In [None]:
NBA.columns

In [None]:
NBA.dtypes # Get data type for each column & verify they are correct

In [None]:
# Report whether the dataset contains any missing value.
# isna() catches both None and NaN. NaN never compares equal to anything (itself included),
# so an `== np.nan` test can never succeed. Empty strings are checked separately.
# Iterating a DataFrame yields column *names*, so the check is done on the frame itself.
# NOTE(review): the percentage columns are NaN-filled later in this notebook, so this check is
# expected to report missing values; prose based on the old always-False check needs revisiting.
missing = bool(NBA.isna().any().any() or NBA.eq('').any().any())
if missing:
    print("There are missing values")
else:
    print("There are no missing values")

The code above checks for any missing values in the dataset. This could be done by inspection, but that would take a while.
Missing values that show up in individual columns are dealt with in the column-by-column cleaning below.

The code below shows players who appear more than once in the dataset, along with the row where each duplicate is located, so that they can be dealt with when cleaning the data. 

In [None]:
# Boolean mask marking every repeated occurrence of a player name
# (the first occurrence of each name is not flagged).
duplicate_names = NBA.duplicated(subset='Player')
# Select rows and column in a single .loc call instead of chained indexing.
NBA.loc[duplicate_names, ['Player']]

## Data Cleaning and checking
First check for errors, then clean the data where necessary.

### Data Cleaning - Player

In [None]:
NBA['Player'] = NBA['Player'].astype(pd.StringDtype()) 

### Data Cleaning - Pos

In [None]:
# Trim whitespace on both ends in one pass (same result as lstrip followed by rstrip).
NBA['Pos'] = NBA['Pos'].str.strip()

# Irregular spellings seen in the Pos column, mapped to their canonical position code.
pos_fixes = {
    'SGa': 'SG',
    'sg': 'SG',
    'SF.': 'SF',
    'Sf': 'SF',
    'pg': 'PG',
    'PFa': 'PF',
    'Pg': 'PG',
}
# Apply all corrections at once, then store the column with pandas' string dtype.
NBA['Pos'] = NBA['Pos'].replace(pos_fixes).astype(pd.StringDtype())

### Data Cleaning - Age

In [None]:
# Smallest and largest values in the Age column, printed as a quick sanity check.
# Named youngest_age/oldest_age rather than min/max so the Python built-ins
# min() and max() are not shadowed for every later cell in the kernel.
youngest_age = NBA["Age"].min()
oldest_age = NBA["Age"].max()
print("The youngest player is aged: ", youngest_age)
print("The oldest player is aged: ", oldest_age)

Upon checking the Age column, we see that the youngest player is aged -19 and the oldest is aged 280. These values are obviously outside any plausible human age range. 

In [None]:
# List players whose recorded age is above 40 (candidates for data-entry errors).
# The message previously carried a stray "]" and a long run of trailing spaces.
print("Players who are over 40: ")
# Single .loc call (row mask + column list) instead of chained indexing.
NBA.loc[NBA.Age > 40, ['Age', 'Player']]

Remove the irregular values in the Age column and set them to the 'intended' input.

In [None]:
# The two impossible ages found above, each mapped to the value it is presumed
# to have been meant as (an extra trailing zero / a stray minus sign).
age_fixes = {280: 28, -19: 19}
NBA['Age'] = NBA['Age'].replace(age_fixes)

### Data Cleaning - Team

In [None]:
# Misspelled team codes seen in the data, mapped to the correct abbreviation.
team_fixes = {'H0U': 'HOU', 'NyK': 'NYK'}
NBA['Tm'] = (
    NBA['Tm']
    .astype(pd.StringDtype())   # store as pandas string dtype
    .str.strip()                # drop whitespace at both ends of each code
    .replace(team_fixes)        # apply the spelling corrections
)

### Data Cleaning - G

In [None]:
NBA.G.max()    ### max is less than 82

In [None]:
NBA.G.unique()

### Data Cleaning - GS

In [None]:
NBA.GS.unique()

### Data Cleaning - MP

In [None]:
NBA.MP.unique()

### Data Cleaning - FG

In [None]:
NBA.FG.unique()   #check for inappropriate value

### Data Cleaning - FGA

In [None]:
NBA.FGA.unique()

### Data Cleaning - FG%

In [None]:
NBA['FG%'].unique()

In [None]:
checkna = NBA['FG%'].isnull().values.any()
checkna

In [None]:
fg = NBA[NBA['FG%'].isnull()].head()
fg[['FG','FGA']]

In [None]:
NBA['FG%'] = NBA['FG%'].fillna(0)

### Data Cleaning - 3P

In [None]:
NBA['3P'].unique()

In [None]:
checkna = NBA['3P'].isnull().values.any()
checkna

### Data Cleaning - 3PA

In [None]:
NBA['3PA'].unique()

In [None]:
checkna = NBA['3PA'].isnull().values.any()
checkna

### Data Cleaning - 3P%

In [None]:
NBA['3P%'].unique()

In [None]:
checkna = NBA['3P%'].isnull().values.any()
checkna

In [None]:
threePercent = NBA[NBA['3P%'].isnull()].head()
threePercent[['3P','3PA']]

Fill nan with zero and recheck again

In [None]:
NBA['3P%'] = NBA['3P%'].fillna(0)
checkna = NBA['3P%'].isnull().values.any()
checkna

### Data Cleaning - 2P

In [None]:
NBA['2P'].unique()

In [None]:
checkna = NBA['2P'].isnull().values.any()
checkna

### Data Cleaning - 2PA

In [None]:
NBA['2PA'].unique()

In [None]:
checkna = NBA['2PA'].isnull().values.any()
checkna

### Data Cleaning - 2P%

In [None]:
NBA['2P%'].unique()

In [None]:
checkna = NBA['2P%'].isnull().values.any()
checkna

In [None]:
twoPercent = NBA[NBA['2P%'].isnull()].head()
twoPercent[['2P','2PA']]

In [None]:
NBA['2P%'] = NBA['2P%'].fillna(0)
checkna = NBA['2P%'].isnull().values.any()
checkna

### Data Cleaning - FT

In [None]:
NBA['FT'].unique()

In [None]:
checkna = NBA['FT'].isnull().values.any()
checkna

### Data Cleaning - FTA

In [None]:
NBA['FTA'].unique()

In [None]:
checkna = NBA['FTA'].isnull().values.any()
checkna

### Data Cleaning - FT%

In [None]:
NBA['FT%'].unique()

In [None]:
checkna = NBA['FT%'].isnull().values.any()
checkna

In [None]:
# Preview the first rows where FT% is missing, next to the FT / FTA counts,
# to see what the makes and attempts look like on those rows.
# Uses its own name instead of reusing `threePercent`, which belongs to the 3P% section.
ft_pct_missing = NBA[NBA['FT%'].isnull()].head()
ft_pct_missing[['FT','FTA']]

In [None]:
NBA['FT%'] = NBA['FT%'].fillna(0)
checkna = NBA['FT%'].isnull().values.any()
checkna

### Data Cleaning - ORB

In [None]:
NBA['ORB'].unique()

In [None]:
checkna = NBA['ORB'].isnull().values.any()
checkna

### Data Cleaning - DRB

In [None]:
NBA['DRB'].unique()

In [None]:
checkna = NBA['DRB'].isnull().values.any()
checkna

### Data Cleaning - TRB

In [None]:
NBA['TRB'].unique()

In [None]:
checkna = NBA['TRB'].isnull().values.any()
checkna

### Data Cleaning - AST

In [None]:
NBA['AST'].unique()

In [None]:
checkna = NBA['AST'].isnull().values.any()
checkna

### Data Cleaning - STL

In [None]:
NBA['STL'].unique()

In [None]:
checkna = NBA['STL'].isnull().values.any()
checkna

### Data Cleaning - BLK

In [None]:
NBA['BLK'].unique()

In [None]:
checkna = NBA['BLK'].isnull().values.any()
checkna

### Data Cleaning - TOV

In [None]:
NBA['TOV'].unique()

### Data Cleaning - TRB

In [None]:
# Null check for the turnover column (TOV), whose unique values are inspected in the cell above.
# This cell previously re-checked TRB, which already has its own null check in the TRB section,
# leaving TOV as the only column without one.
checkna = NBA['TOV'].isnull().values.any()
checkna

### Data Cleaning - PF

In [None]:
NBA['PF'].unique()

In [None]:
checkna = NBA['PF'].isnull().values.any()
checkna

### Data Cleaning - PTS

In [None]:
NBA.loc[NBA['PTS'] > 2000]

In [None]:
# Rows whose point total exceeds 2000 are scaled down by 100 and truncated to an integer.
pts_too_high = NBA['PTS'] > 2000
# .to_numpy() keeps the assignment positional (as the original list(...) did),
# so it does not rely on index labels being unique.
NBA.loc[pts_too_high, 'PTS'] = (NBA.loc[pts_too_high, 'PTS'] / 100).astype(int).to_numpy()

In [None]:
checkna = NBA['PTS'].isnull().values.any()
checkna

To clean duplicate player data, add their int and float values throughout the season and pick their current team and position.

In [None]:
# Collapse players who appear on several rows into a single row per (Rk, Player, Age).
#
# Counting stats are additive, so they are summed across a player's rows.
# Shooting percentages are NOT additive (0.450 + 0.500 is not a 95% shooter), so instead of
# summing them they are recomputed from the summed makes and attempts.
# NOTE(review): if the source also contains a season-total row per traded player, summing
# counting stats would double count — confirm against the raw file.
count_cols = ['G', 'GS', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
              'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

# percentage column -> (made column, attempted column)
pct_sources = {
    'FG%': ('FG', 'FGA'),
    '3P%': ('3P', '3PA'),
    '2P%': ('2P', '2PA'),
    'FT%': ('FT', 'FTA'),
}

# Same column layout the notebook has always produced after this step.
column_order = ['Pos', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
                '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
                'STL', 'BLK', 'TOV', 'PF', 'PTS']

# 'last' keeps the position/team from the player's final row in the file
# (presumably the current team — assumes rows are in chronological order; TODO confirm).
agg_spec = {'Pos': 'last', 'Tm': 'last'}
agg_spec.update({col: 'sum' for col in count_cols})
NBA = NBA.groupby(['Rk','Player','Age']).agg(agg_spec)

for pct_col, (made_col, attempts_col) in pct_sources.items():
    attempts = NBA[attempts_col]
    # Zero attempts -> NaN denominator -> filled with 0, matching the fillna(0) used when
    # cleaning these columns. Rounded to 3 decimals (assumed to match the source precision).
    NBA[pct_col] = (NBA[made_col] / attempts.where(attempts != 0)).fillna(0).round(3)

# Move Player and Age back to ordinary columns; Rk stays as the index.
NBA = NBA[column_order].reset_index(level=[1,2])
NBA['Player'] = NBA['Player'].astype(pd.StringDtype()) 
NBA['Pos'] = NBA['Pos'].astype(pd.StringDtype()) 
NBA['Tm'] = NBA['Tm'].astype(pd.StringDtype()) 
NBA.head(30)

In [None]:
NBA.dtypes

In [None]:
NBA.to_csv('cleaned_NBA_players_stats.csv')
# Output cleaned data file as required

## Checking Data for errors 
This section is for verifying no errors remain after the check and cleanup

### Checking Data for errors - Player

In [None]:
# Post-cleaning verification of the Player column: dtype, missing values, distinct count.
dtyp = NBA['Player'].dtype
checkna = NBA['Player'].isnull().values.any()
lent = len(NBA['Player'].unique())
print ("The data type is: ", dtyp) # expected type string
print ("Is there NaN values: ", checkna) # expect False
# The label previously said "team" (copied from the Team check); this counts distinct players.
print ("Number of players available is: ", lent) # Expected: 492

### Checking Data for errors - Age

In [None]:
# Re-check the age range after cleaning.
# Named youngest_age/oldest_age rather than min/max so the Python built-ins
# min() and max() are not shadowed for the rest of the kernel session.
youngest_age = NBA["Age"].min()
oldest_age = NBA["Age"].max()
print("The youngest player is aged: ", youngest_age)
print("The oldest player is aged: ", oldest_age)

In [None]:
# After cleaning, list any players still recorded as older than 40.
# The message previously carried a stray "]" and a long run of trailing spaces.
print("Players who are over 40: ")
# Single .loc call (row mask + column list) instead of chained indexing.
NBA.loc[NBA.Age > 40, ['Age', 'Player']]

In [None]:
# After cleaning, list any players recorded as younger than 18.
print("Players who are younger than 18: ")
# Row mask and column list passed together to .loc.
NBA.loc[NBA.Age < 18, ['Age', 'Player']]

### Checking Data for errors - Position

In [None]:
# Post-cleaning verification of the Pos column: dtype, missing values, distinct count.
dtyp = NBA['Pos'].dtype
checkna = NBA['Pos'].isnull().values.any()
lent = len(NBA['Pos'].unique())
print ("The data type is: ", dtyp) # expected type string (Pos is cast to StringDtype during cleaning)
print ("Is there NaN values: ", checkna) # expect False
print ("Numbers of positions available is: ", lent) # Expect to be >7

### Checking Data for errors - Team

In [None]:
# Post-cleaning verification of the Tm column: dtype, missing values, distinct count.
dtyp = NBA['Tm'].dtype
checkna = NBA['Tm'].isnull().values.any()
lent = len(NBA['Tm'].unique())
print ("The data type is: ", dtyp) # expected type string (Tm is cast to StringDtype during cleaning)
print ("Is there NaN values: ", checkna) # expect False
print ("Numbers of team available is: ", lent) # expect <= 31

### Checking Data for errors - Games

In [None]:
dtyp = NBA['G'].dtype
checkna = NBA['G'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expect False

### Checking Data for errors - Games Started

In [None]:
dtyp = NBA['GS'].dtype
checkna = NBA['GS'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expect False

### Checking Data for errors - Minutes Played

In [None]:
# Post-cleaning verification of the Minutes Played column (MP): dtype and missing values.
# This cell previously inspected 'PF' (personal fouls), which has its own check further down,
# so MP itself was never verified.
dtyp = NBA['MP'].dtype
checkna = NBA['MP'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expect False

### Checking Data for errors - Field Goals

In [None]:
dtyp = NBA['FG'].dtype
checkna = NBA['FG'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expect False

### Checking Data for errors - Field Goal Attempts

In [None]:
dtyp = NBA['FGA'].dtype
checkna = NBA['FGA'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expect False

### Checking Data for errors - Field Goal Percentage

In [None]:
dtyp = NBA['FG%'].dtype
checkna = NBA['FG%'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type float
print ("Is there NaN values: ", checkna) # expected False

### Checking Data for errors - 3-Point Field Goals

In [None]:
dtyp = NBA['3P'].dtype
checkna = NBA['3P'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expected False

### Checking Data for errors - 3-Point Field Goal Attempts

In [None]:
dtyp = NBA['3PA'].dtype
checkna = NBA['3PA'].isnull().values.any()
print ("The data type is (should be int): ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expected False

### Checking Data for errors - 3-Point Field Goal Percentage

In [None]:
dtyp = NBA['3P%'].dtype
checkna = NBA['3P%'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type float
print ("Is there NaN values: ", checkna) # expected False

### Checking Data for errors - 2-Point Field Goals

In [None]:
dtyp = NBA['2P'].dtype
checkna = NBA['2P'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expected False

### Checking Data for errors - 2-Point Field Goal Attempts

In [None]:
dtyp = NBA['2PA'].dtype
checkna = NBA['2PA'].isnull().values.any()
print ("The data type is: ", dtyp) # expected type int
print ("Is there NaN values: ", checkna) # expected False

### Checking Data for errors - 2-Point Field Goal Percentage

In [None]:
dtyp = NBA['2P%'].dtype
checkna = NBA['2P%'].isnull().values.any()
print ("The data type is (should be float): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Free Throws

In [None]:
dtyp = NBA['FT'].dtype
checkna = NBA['FT'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Free Throw Attempts

In [None]:
dtyp = NBA['FTA'].dtype
checkna = NBA['FTA'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Free Throw Percentage

In [None]:
dtyp = NBA['FT%'].dtype
checkna = NBA['FT%'].isnull().values.any()
print ("The data type is (should be float): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Offensive Rebounds

In [None]:
dtyp = NBA['ORB'].dtype
checkna = NBA['ORB'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Defensive Rebounds

In [None]:
dtyp = NBA['DRB'].dtype
checkna = NBA['DRB'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Total Rebounds

In [None]:
dtyp = NBA['TRB'].dtype
checkna = NBA['TRB'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Assists

In [None]:
dtyp = NBA['AST'].dtype
checkna = NBA['AST'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Steals

In [None]:
dtyp = NBA['STL'].dtype
checkna = NBA['STL'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Blocks

In [None]:
dtyp = NBA['BLK'].dtype
checkna = NBA['BLK'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Turnovers

In [None]:
dtyp = NBA['TOV'].dtype
checkna = NBA['TOV'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - Personal Fouls

In [None]:
dtyp = NBA['PF'].dtype
checkna = NBA['PF'].isnull().values.any()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)

### Checking Data for errors - The Total Points

In [None]:
# Post-cleaning verification of the PTS column: dtype, missing values, and value range.
dtyp = NBA['PTS'].dtype
checkna = NBA['PTS'].isnull().values.any()
# Named lowest_pts/highest_pts rather than min/max so the Python built-ins are not shadowed.
lowest_pts = NBA["PTS"].min()
highest_pts = NBA["PTS"].max()
print ("The data type is (should be int): ", dtyp)
print ("Is there NaN values (should be False): ", checkna)
# NOTE(review): the "should be ..." hints in the two messages below are kept verbatim;
# confirm they state the intended expectations.
print ("The highest point is (should be > 2000): ", highest_pts)
print ("The lowest point is (should not be > 0): ", lowest_pts)

# Task 2: Data Exploration

## Task 2.1 


In [None]:
sort_by_Tpoints = NBA.sort_values('PTS',ascending=False).head(5)
sort_by_Tpoints

Chart of the distribution of total points between the top five players based on PTS.

In [None]:
# Pie chart of the PTS share among the top five scorers.
# Wedges are labelled with player names; without `labels`, pandas labels them with the
# frame's index values, which do not identify the players to a reader.
sort_by_Tpoints.plot.pie(y='PTS', labels=sort_by_Tpoints['Player'].tolist(), figsize=(10, 10))

In [None]:
sort_by_Tpoints.plot.bar(x='Player',y='PTS', rot=20, figsize=(13, 20))

## Task 2.2 

In [None]:
NBA = pd.read_csv('NBA_players_stats.csv', index_col=0) #prevent index number from affecting result
#Reloading the file to use the original file rather than the cleaned data as per question specs
# .copy() makes NBAE an independent frame, so the later fill of '3P%' does not write into
# a slice of NBA (which raises SettingWithCopyWarning).
NBAE = NBA[["3P","3PA","3P%"]].copy()
NBAE.head()

In [None]:
NBAE.info()

In [None]:
NBAE.describe()  #using describe function we can sense that there are some missing values on 3P% at row count

In [None]:
# Heatmap of the null mask: highlighted cells mark missing values in NBAE.
# The citation is kept as comments; as a bare triple-quoted string it was the cell's last
# expression, so Jupyter echoed the raw text as the cell output.
# [1]"seaborn heatmap - Python Tutorial", Pythonbasics.org, 2021. [Online]. Available: https://pythonbasics.org/seaborn-heatmap/. [Accessed: 09- Apr- 2021].
# Only citation in notebook so I put it here rather than make a whole section for a single citation
sns.heatmap(NBAE.isnull(), cbar=False);  # trailing ';' hides the Axes repr

The heatmap quickly shows missing values on 3P% column.

Fixing the error by filling nan with zeros

In [None]:
# Replace missing 3P% values with 0.
# assign() builds a new frame rather than writing into NBAE in place; NBAE was created as a
# column selection of NBA, and writing into it triggers pandas' SettingWithCopyWarning.
NBAE = NBAE.assign(**{'3P%': NBAE['3P%'].fillna(0)})
NBAE

## Task 2.3 


In [None]:
# First ten rows of the scoring-related columns, as an independent copy so the PTS fix
# below does not write into a slice of NBA (SettingWithCopyWarning).
columns = NBA[['Player','FG','3P','2P','PTS']].head(10).copy()
# Same outlier rule as in the cleaning section: totals above 2000 are divided by 100 and truncated.
pts_too_high = columns['PTS'] > 2000
# .to_numpy() keeps the assignment positional, so it does not depend on unique index labels.
columns.loc[pts_too_high, 'PTS'] = (columns.loc[pts_too_high, 'PTS'] / 100).astype(int).to_numpy()
columns

In [None]:
# Grouped bar chart of the numeric columns in `columns`. No x column is passed,
# so the bars are positioned by the frame's index rather than by player name.
columns.plot.bar()