### Pandas 

* pandas stands for python data analysis library
* It is a free and open source
* It is an excellent tool for Data Processing and analyzing real world data

### Pandas data structures

* 1. Series : Used to create 1 Dimensional array with named index 
                * eg: single column in an excel sheet
                
* 2. Data Frame: Used to  create 2 dimensional array with row index and column index 
                * eg: table in an excel sheet

In [1]:
import pandas as pd

In [2]:
pd.__version__

'1.4.4'

In [3]:
# creating series using list

l=[10,20,30,40,50]
s=pd.Series(l)
print(s)
print(type(s))

0    10
1    20
2    30
3    40
4    50
dtype: int64
<class 'pandas.core.series.Series'>


In [4]:
# changing index

s.index=['a','b','c','d','e']
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [5]:
# create series by using tuple (100,546,89,75)

s1=pd.Series((100,546,89,75))
s1

0    100
1    546
2     89
3     75
dtype: int64

In [6]:
# creating series by using nd array

import numpy as np
a=np.array([12,54,78])
s3=pd.Series(a)
s3

0    12
1    54
2    78
dtype: int32

In [7]:
s3.index=['x','y','z']
s3

x    12
y    54
z    78
dtype: int32

In [25]:
print(s3['x'])
print(s3['y'])

12
54


In [10]:
s3['z']

78

In [11]:
s3[:]

x    12
y    54
z    78
dtype: int32

In [14]:
s3.index  # to get only index

Index(['x', 'y', 'z'], dtype='object')

In [16]:
s3.values  # to get values of series object

array([12, 54, 78])

In [19]:
# creating series by using range
s4=pd.Series(range(11,21))
s4

0    11
1    12
2    13
3    14
4    15
5    16
6    17
7    18
8    19
9    20
dtype: int64

In [21]:
s5=pd.Series([10,90,89,76],dtype=complex)
s5

0    10.0+0.0j
1    90.0+0.0j
2    89.0+0.0j
3    76.0+0.0j
dtype: complex128

In [23]:
s5=pd.Series([90,87,75,89],index=('st1','st2','st3','st4'))
s5

st1    90
st2    87
st3    75
st4    89
dtype: int64

In [26]:
# list comprehesion [o/p loop condition]

[i**2 for i in range(1,11)]

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

In [27]:
s6=pd.Series([i**2 for i in range(1,11)])
s6

0      1
1      4
2      9
3     16
4     25
5     36
6     49
7     64
8     81
9    100
dtype: int64

### DataFrame

* Dataframe is  a two dimensional, size mutable, heterogeneous tabular data
* like a table in an excel sheet

In [28]:
# creating data frame using list

l=[10,55,47,89,96]
df=pd.DataFrame(l)
df

Unnamed: 0,0
0,10
1,55
2,47
3,89
4,96


In [31]:
# Creating dataframe by using Series
df2=pd.DataFrame(pd.Series([10,20,30,40],index=['a','b','c','d']))
df2

Unnamed: 0,0
a,10
b,20
c,30
d,40


In [32]:
nl=[[12,34,56],[67,89,54]]
df=pd.DataFrame(nl)
df

Unnamed: 0,0,1,2
0,12,34,56
1,67,89,54


In [33]:
df.index  # to get row index

RangeIndex(start=0, stop=2, step=1)

In [34]:
df.values   # to get all values from dataframe

array([[12, 34, 56],
       [67, 89, 54]], dtype=int64)

In [36]:
df.columns  # to get column index

RangeIndex(start=0, stop=3, step=1)

In [37]:
df.index=['x','y']
df

Unnamed: 0,0,1,2
x,12,34,56
y,67,89,54


In [38]:
df.columns=['col1','col2','col3']
df

Unnamed: 0,col1,col2,col3
x,12,34,56
y,67,89,54


In [48]:
d={'emp':pd.Series(['emp1','emp2','emp3'],index=[1,2,3]),
  'year':pd.Series([2000,2002,2003],index=[1,2,3])}
df1=pd.DataFrame(d)
df1

Unnamed: 0,emp,year
1,emp1,2000
2,emp2,2002
3,emp3,2003


In [49]:
d2={'emp':pd.Series(['emp1','emp2','emp4'],index=[1,2,3]),
  'dept':pd.Series(['op','hr','finance'],index=[1,2,3])}
df2=pd.DataFrame(d2)
df2

Unnamed: 0,emp,dept
1,emp1,op
2,emp2,hr
3,emp4,finance


In [50]:
# combining two dataframes

pd.merge(df1,df2)

Unnamed: 0,emp,year,dept
0,emp1,2000,op
1,emp2,2002,hr


In [51]:
pd.merge(df1,df2,how="left")

Unnamed: 0,emp,year,dept
0,emp1,2000,op
1,emp2,2002,hr
2,emp3,2003,


In [52]:
pd.merge(df1,df2,how="right")

Unnamed: 0,emp,year,dept
0,emp1,2000.0,op
1,emp2,2002.0,hr
2,emp4,,finance


In [80]:
# creating dictionary for student

s={'name':['ganesh','akash','kumar'],
  'rollno':[10,11,12],
  'branch':['IT','CSE','EEE']}

df=pd.DataFrame(s)
df

Unnamed: 0,name,rollno,branch
0,ganesh,10,IT
1,akash,11,CSE
2,kumar,12,EEE


In [81]:
# to get columns

df.columns

Index(['name', 'rollno', 'branch'], dtype='object')

In [82]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [83]:
df.values

array([['ganesh', 10, 'IT'],
       ['akash', 11, 'CSE'],
       ['kumar', 12, 'EEE']], dtype=object)

In [84]:
# to get top 2 records
df.head(2)

Unnamed: 0,name,rollno,branch
0,ganesh,10,IT
1,akash,11,CSE


In [85]:
# to get bottom 2 records

df.tail(2)

Unnamed: 0,name,rollno,branch
1,akash,11,CSE
2,kumar,12,EEE


In [86]:
# to get no of rows, columns

df.shape

(3, 3)

In [87]:
df

Unnamed: 0,name,rollno,branch
0,ganesh,10,IT
1,akash,11,CSE
2,kumar,12,EEE


In [88]:
df['name']  # access names column

0    ganesh
1     akash
2     kumar
Name: name, dtype: object

In [89]:
df['rollno']

0    10
1    11
2    12
Name: rollno, dtype: int64

In [90]:
df[['name','rollno']]

Unnamed: 0,name,rollno
0,ganesh,10
1,akash,11
2,kumar,12


In [91]:
# to add new column marks

df['marks']=[97,85,78]
df

Unnamed: 0,name,rollno,branch,marks
0,ganesh,10,IT,97
1,akash,11,CSE,85
2,kumar,12,EEE,78


In [92]:
# indexing 

# iloc -> integer based indexing
# loc -> both integer and string based indexing

In [93]:
df

Unnamed: 0,name,rollno,branch,marks
0,ganesh,10,IT,97
1,akash,11,CSE,85
2,kumar,12,EEE,78


In [94]:
df.iloc[1,1]

11

In [95]:
df.loc[df['name']=='ganesh']

Unnamed: 0,name,rollno,branch,marks
0,ganesh,10,IT,97


In [96]:
# to add new record

df.loc[3]=['Mahesh',14,'EEE',88]
df

Unnamed: 0,name,rollno,branch,marks
0,ganesh,10,IT,97
1,akash,11,CSE,85
2,kumar,12,EEE,78
3,Mahesh,14,EEE,88


In [97]:
# to get row with index 2

df.iloc[2]

name      kumar
rollno       12
branch      EEE
marks        78
Name: 2, dtype: object

In [98]:
# to update particular value

df.loc[2,'marks']=98
df

Unnamed: 0,name,rollno,branch,marks
0,ganesh,10,IT,97
1,akash,11,CSE,85
2,kumar,12,EEE,98
3,Mahesh,14,EEE,88


In [99]:
# rename a particular column

df.rename(columns={'branch':'Branch'},inplace=True)
df

Unnamed: 0,name,rollno,Branch,marks
0,ganesh,10,IT,97
1,akash,11,CSE,85
2,kumar,12,EEE,98
3,Mahesh,14,EEE,88


In [100]:
# to  rename all the columns
df.columns=['Name','Rollno','Branch','Marks']
df

Unnamed: 0,Name,Rollno,Branch,Marks
0,ganesh,10,IT,97
1,akash,11,CSE,85
2,kumar,12,EEE,98
3,Mahesh,14,EEE,88


In [101]:
# to change row index

df.index=['a','b','c','d']
df

Unnamed: 0,Name,Rollno,Branch,Marks
a,ganesh,10,IT,97
b,akash,11,CSE,85
c,kumar,12,EEE,98
d,Mahesh,14,EEE,88


In [102]:
# to delete data from a dataframe

# drop
# axis=0 -> row based
# axis=1 -> column based


In [103]:

# to delete 'a' index record
df.drop('a',axis=0,inplace=True)
df

Unnamed: 0,Name,Rollno,Branch,Marks
b,akash,11,CSE,85
c,kumar,12,EEE,98
d,Mahesh,14,EEE,88


In [105]:
# to delete marks column

df.drop("Marks",axis=1,inplace=True)
df

Unnamed: 0,Name,Rollno,Branch
b,akash,11,CSE
c,kumar,12,EEE
d,Mahesh,14,EEE


In [106]:
# to delete all the columns

df.drop(df.columns,axis=1,inplace=True)
df

b
c
d


### File I/O

In [111]:
df=pd.read_csv("C://Users//meena//Downloads//Salary_Data.csv")
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [112]:
df.shape

(6704, 6)

In [114]:
df.index

RangeIndex(start=0, stop=6704, step=1)

In [115]:
df.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [116]:
df.values

array([[32.0, 'Male', "Bachelor's", 'Software Engineer', 5.0, 90000.0],
       [28.0, 'Female', "Master's", 'Data Analyst', 3.0, 65000.0],
       [45.0, 'Male', 'PhD', 'Senior Manager', 15.0, 150000.0],
       ...,
       [30.0, 'Female', "Bachelor's Degree", 'Financial Manager', 4.0,
        55000.0],
       [46.0, 'Male', "Master's Degree", 'Marketing Manager', 14.0,
        140000.0],
       [26.0, 'Female', 'High School', 'Sales Executive', 1.0, 35000.0]],
      dtype=object)

In [117]:
df.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,6702.0,6701.0,6699.0
mean,33.620859,8.094687,115326.964771
std,7.614633,6.059003,52786.183911
min,21.0,0.0,350.0
25%,28.0,3.0,70000.0
50%,32.0,7.0,115000.0
75%,38.0,12.0,160000.0
max,62.0,34.0,250000.0


In [118]:
# to get count of males,females and others

df["Gender"].value_counts()

Male      3674
Female    3014
Other       14
Name: Gender, dtype: int64

In [119]:
# to get the no of job titles

df["Job Title"].value_counts()

Software Engineer             518
Data Scientist                453
Software Engineer Manager     376
Data Analyst                  363
Senior Project Engineer       318
                             ... 
Account Manager                 1
Help Desk Analyst               1
Senior Training Specialist      1
Junior Web Designer             1
Supply Chain Analyst            1
Name: Job Title, Length: 193, dtype: int64

In [120]:
# to get the records of data scientist

df[df["Job Title"]=="Data Scientist"]

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
32,29.0,Male,Master's,Data Scientist,3.0,75000.0
375,31.0,Male,Master's,Data Scientist,6.0,160000.0
379,30.0,Male,PhD,Data Scientist,5.0,180000.0
383,33.0,Male,PhD,Data Scientist,8.0,190000.0
387,27.0,Male,PhD,Data Scientist,2.0,115000.0
...,...,...,...,...,...,...
6202,32.0,Female,PhD,Data Scientist,9.0,145000.0
6216,32.0,Female,PhD,Data Scientist,9.0,145000.0
6230,32.0,Female,PhD,Data Scientist,9.0,145000.0
6244,32.0,Female,PhD,Data Scientist,9.0,145000.0


In [121]:
# to update any value

df.loc[1,"Education Level"]='MASTERS'
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,MASTERS,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [122]:
# to get maximum salary records

df[df['Salary']==max(df["Salary"])]

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
30,50.0,Male,Bachelor's,CEO,25.0,250000.0
83,52.0,Male,PhD,Chief Technology Officer,24.0,250000.0
5001,45.0,Male,Bachelor's Degree,Financial Manager,21.0,250000.0


In [124]:
# sorting values based on salary

df.sort_values('Salary',ascending=False)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
83,52.0,Male,PhD,Chief Technology Officer,24.0,250000.0
30,50.0,Male,Bachelor's,CEO,25.0,250000.0
5001,45.0,Male,Bachelor's Degree,Financial Manager,21.0,250000.0
4306,51.0,Male,PhD,Data Scientist,24.0,240000.0
4366,51.0,Male,PhD,Data Scientist,24.0,240000.0
...,...,...,...,...,...,...
172,,,,,,
260,,,,,,
3136,31.0,Male,Master's Degree,Full Stack Engineer,8.0,
5247,26.0,Female,Bachelor's Degree,Social M,,


In [125]:
df['Job Title'].unique()

array(['Software Engineer', 'Data Analyst', 'Senior Manager',
       'Sales Associate', 'Director', 'Marketing Analyst',
       'Product Manager', 'Sales Manager', 'Marketing Coordinator',
       'Senior Scientist', 'Software Developer', 'HR Manager',
       'Financial Analyst', 'Project Manager', 'Customer Service Rep',
       'Operations Manager', 'Marketing Manager', 'Senior Engineer',
       'Data Entry Clerk', 'Sales Director', 'Business Analyst',
       'VP of Operations', 'IT Support', 'Recruiter', 'Financial Manager',
       'Social Media Specialist', 'Software Manager', 'Junior Developer',
       'Senior Consultant', 'Product Designer', 'CEO', 'Accountant',
       'Data Scientist', 'Marketing Specialist', 'Technical Writer',
       'HR Generalist', 'Project Engineer', 'Customer Success Rep',
       'Sales Executive', 'UX Designer', 'Operations Director',
       'Network Engineer', 'Administrative Assistant',
       'Strategy Consultant', 'Copywriter', 'Account Manager',
      

In [126]:
len(df['Job Title'].unique())

194

In [127]:
# statistics

df.max()

  df.max()


Age                        62.0
Years of Experience        34.0
Salary                 250000.0
dtype: float64

In [128]:
df.min()

  df.min()


Age                     21.0
Years of Experience      0.0
Salary                 350.0
dtype: float64

In [129]:
df.std()

  df.std()


Age                        7.614633
Years of Experience        6.059003
Salary                 52786.183911
dtype: float64

In [130]:
df.mean()

  df.mean()


Age                        33.620859
Years of Experience         8.094687
Salary                 115326.964771
dtype: float64

In [132]:
df.head()  # top 5 rows

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,MASTERS,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [133]:
df.head(2)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,MASTERS,Data Analyst,3.0,65000.0


In [135]:
df.tail()  # bottom 5 rows

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0
6703,26.0,Female,High School,Sales Executive,1.0,35000.0


### Data cleaning using pandas

* Nan -> Not a number
* To deal with duplicates and missing values
* isnull()
* notnull()
* fillna()
* replace()

In [136]:
emp=pd.read_csv("C://Users//meena//Downloads//employe.csv")
emp

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.170,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,11/23/2014,6:09 AM,132483,16.655,False,Distribution
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [137]:
emp.shape

(1000, 8)

In [138]:
emp.head()


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [139]:
emp.isnull()  # detect the missing value

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
995,False,True,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False


In [140]:
# total no of missing values

emp.isnull().sum()

First Name            67
Gender               145
Start Date             0
Last Login Time        0
Salary                 0
Bonus %                0
Senior Management     67
Team                  43
dtype: int64

In [141]:
emp.notnull().sum()

First Name            933
Gender                855
Start Date           1000
Last Login Time      1000
Salary               1000
Bonus %              1000
Senior Management     933
Team                  957
dtype: int64

In [142]:
emp.dropna()  # removes rows that contains null values

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
5,Dennis,Male,4/18/1987,1:35 AM,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,6/21/2013,5:47 PM,98874,4.479,True,Marketing
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [143]:
emp.dropna(axis=0)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
5,Dennis,Male,4/18/1987,1:35 AM,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,6/21/2013,5:47 PM,98874,4.479,True,Marketing
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [144]:
emp.dropna(axis=1)

Unnamed: 0,Start Date,Last Login Time,Salary,Bonus %
0,8/6/1993,12:42 PM,97308,6.945
1,3/31/1996,6:53 AM,61933,4.170
2,4/23/1993,11:17 AM,130590,11.858
3,3/4/2005,1:00 PM,138705,9.340
4,1/24/1998,4:47 PM,101004,1.389
...,...,...,...,...
995,11/23/2014,6:09 AM,132483,16.655
996,1/31/1984,6:30 AM,42392,19.675
997,5/20/2013,12:39 PM,96914,1.421
998,4/20/2013,4:45 PM,60500,11.985


In [145]:
emp.dropna().sum()

First Name           DouglasMariaJerryLarryDennisRubyAngelaFrancesJ...
Gender               MaleFemaleMaleMaleMaleFemaleFemaleFemaleFemale...
Start Date           8/6/19934/23/19933/4/20051/24/19984/18/19878/1...
Last Login Time      12:42 PM11:17 AM1:00 PM4:47 PM1:35 AM4:20 PM6:...
Salary                                                        69090962
Bonus %                                                       7753.103
Senior Management                                                  381
Team                 MarketingFinanceFinanceClient ServicesLegalPro...
dtype: object

In [147]:
emp['Gender'].isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
995     True
996    False
997    False
998    False
999    False
Name: Gender, Length: 1000, dtype: bool

In [146]:
emp['Gender'].fillna("No Gender")   # fills the null values with user specific value

0           Male
1           Male
2         Female
3           Male
4           Male
         ...    
995    No Gender
996         Male
997         Male
998         Male
999         Male
Name: Gender, Length: 1000, dtype: object

In [148]:
emp['Gender'].fillna(0)

0        Male
1        Male
2      Female
3        Male
4        Male
        ...  
995         0
996      Male
997      Male
998      Male
999      Male
Name: Gender, Length: 1000, dtype: object

In [149]:
# replace

emp.replace(to_replace="Male",value='MALE')


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,MALE,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,MALE,3/31/1996,6:53 AM,61933,4.170,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,MALE,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,MALE,1/24/1998,4:47 PM,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,11/23/2014,6:09 AM,132483,16.655,False,Distribution
996,Phillip,MALE,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,MALE,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,MALE,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [150]:
emp.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [151]:
emp.drop_duplicates()  # removes the duplicate rows

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.170,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,11/23/2014,6:09 AM,132483,16.655,False,Distribution
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development
