In [6]:
import pandas as pd

# Series = A Pandas 1-Dimensional labelled array that can hold any data type 
#          Think of it as a single column in a spreadsheet (1-Dimensional)

data = [100, 102, 103, 104, 105]

series = pd.Series(data)    
print(series)

data = [100.1, 102.4, 103.4, 104.7, 105.8]
series = pd.Series(data)
print(series)

data = ['A', 'B', 'C', 'D', 'E']
series = pd.Series(data)
print(series)

data = [True, False, True, False, True] 
series = pd.Series(data)
print(series)

0    100
1    102
2    103
3    104
4    105
dtype: int64
0    100.1
1    102.4
2    103.4
3    104.7
4    105.8
dtype: float64
0    A
1    B
2    C
3    D
4    E
dtype: object
0     True
1    False
2     True
3    False
4     True
dtype: bool


In [8]:

data = [100, 102, 103, 104, 105]

series = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])
print(series)

a    100
b    102
c    103
d    104
e    105
dtype: int64


In [9]:
data = [100, 102, 103, 104, 105]

series = pd.Series(data, index=['appartment1', 'appartement2', 'appartement3', 'appartement4', 'appartement5'])
print(series)

appartment1     100
appartement2    102
appartement3    103
appartement4    104
appartement5    105
dtype: int64


In [13]:
data = [100, 102, 103, 104, 105]

series = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])

print(series.loc['a']) # Accessing the value at index 'a'
print(series.loc['c']) # Accessing the value at index 'c'
print(series.loc['e']) # Accessing the value at index 'e'
print( )
print(series.iloc[0]) # Accessing the value at position 0
print(series.iloc[2]) # Accessing the value at position 2
print(series.iloc[4]) # Accessing the value at position 4

100
103
105

100
103
105


In [15]:
data = [100, 102, 103, 204, 205]

series = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])

print(series[series >= 200]) 
# Filtering values greater than or equal to 200    

print(series[series < 200]) 
# Filtering values less than 200


d    204
e    205
dtype: int64
a    100
b    102
c    103
dtype: int64


In [19]:
# Using a dictionary to create a Series
import pandas as pd

calories = {"Day 1": 1250,
            "Day 2": 2100,
            "Day 3": 1800,}

series = pd.Series(calories)

print(series)

print(series.loc['Day 1']) # Accessing the value for 'Day 1'
print(series.loc['Day 2']) # Accessing the value for 'Day 2'

# Updating Day 3s calories
series.loc['Day 3'] += 500

print(series)


Day 1    1250
Day 2    2100
Day 3    1800
dtype: int64
1250
2100
Day 1    1250
Day 2    2100
Day 3    2300
dtype: int64


In [None]:

calories = {"Day 1": 1250,
            "Day 2": 2100,
            "Day 3": 1800,}

series = pd.Series(calories)

print(series[series > 2000]) 
# Filtering values greater than 2000

print(series[series <= 2000])
# Filtering values less than or equal to 2000

Day 2    2100
dtype: int64
Day 1    1250
Day 3    1800
dtype: int64


# DataFrame

In [6]:
import pandas as pd

# DataFrame = A tabular data structure with rows And columns. (2-Dimensional)
#             Similar to an Excel spreadsheet or SQL table.

data = {"Name": ["Spongebob", "Patrick", "Sandy"],
        "Age": [20, 21, 19],
}

df = pd.DataFrame(data)
# A DataFrame can be created from a dictionary where the keys are the column names and the values are lists of data for each column.
# Creating a DataFrame from a dictionary

print(df)

        Name  Age
0  Spongebob   20
1    Patrick   21
2      Sandy   19


In [None]:
data = {"Name": ["Spongebob", "Patrick", "Sandy"],
        "Age": [20, 21, 19],
}

df = pd.DataFrame(data, index=["Employee 1", "Employee 2", "Employee 3"])
# A DataFrame can also be created with a custom index, which allows you to label the rows with specific names instead of default integer indices.

print(df.loc['Employee 1']) # Accessing the row for 'Employee 1'
print(df.loc['Employee 2']) # Accessing the row for 'Employee 2'
print( )
print(df.iloc[0]) # Accessing the first row using integer location
print(df.iloc[1]) # Accessing the second row using integer location

df["Job"] = ["Cook", "N/A", "Cashier"]
# Adding a new column 'Job' to the DataFrame with the specified values.

new_row = pd.DataFrame([{"Name": "Sandy", "Age": 20, "Job": "Scientist"}],
                       index=["Employee 4"])
# Creating a new DataFrame for the new row to be added, with the same columns as the original DataFrame and a custom index.

df = pd.concat([df, new_row])
# Concatenating the original DataFrame with the new row DataFrame to create an updated DataFrame that includes the new row.
print()

print(df)
# Displaying the updated DataFrame with the new row added.

Name    Spongebob
Age            20
Name: Employee 1, dtype: object
Name    Patrick
Age          21
Name: Employee 2, dtype: object

Name    Spongebob
Age            20
Name: Employee 1, dtype: object
Name    Patrick
Age          21
Name: Employee 2, dtype: object

                 Name  Age        Job
Employee 1  Spongebob   20       Cook
Employee 2    Patrick   21        N/A
Employee 3      Sandy   19    Cashier
Employee 4      Sandy   20  Scientist


# Importing 

In [16]:
import pandas as pd

df = pd.read_csv("PokemonData.csv")
# Reading a CSV file named "PokemonData.csv" into a DataFrame called 'df' using the read_csv function from the pandas library.
print(df)

     Num                   Name    Type1   Type2  HP  Attack  Defense  SpAtk  \
0      1              Bulbasaur    Grass  Poison  45      49       49     65   
1      2                Ivysaur    Grass  Poison  60      62       63     80   
2      3               Venusaur    Grass  Poison  80      82       83    100   
3      3  VenusaurMega Venusaur    Grass  Poison  80     100      123    122   
4      4             Charmander     Fire     NaN  39      52       43     60   
..   ...                    ...      ...     ...  ..     ...      ...    ...   
795  719                Diancie     Rock   Fairy  50     100      150    100   
796  719    DiancieMega Diancie     Rock   Fairy  50     160      110    160   
797  720    HoopaHoopa Confined  Psychic   Ghost  80     110       60    150   
798  720     HoopaHoopa Unbound  Psychic    Dark  80     160       60    170   
799  721              Volcanion     Fire   Water  80     110      120    130   

     SpDef  Speed  Generation  Legendar

In [None]:
# dropping a column from the DataFrame, we can use the drop method and specify the column name and axis=1 to indicate that we want to drop a column (as opposed to a row which would be axis=0).
df = df.drop("spDef", axis=1)
df = df.drop("spAtk", axis=1)
df = df.drop("speed", axis=1)
df = df.drop("generation", axis=1)
df = df.drop("Defense", axis=1)
# The above lines of code are dropping the specified columns from the DataFrame 'df' using the drop method. Each line drops a different column based on its name.

print(df)

     Num                   Name    Type1   Type2  HP  Attack  Legendary
0      1              Bulbasaur    Grass  Poison  45      49      False
1      2                Ivysaur    Grass  Poison  60      62      False
2      3               Venusaur    Grass  Poison  80      82      False
3      3  VenusaurMega Venusaur    Grass  Poison  80     100      False
4      4             Charmander     Fire     NaN  39      52      False
..   ...                    ...      ...     ...  ..     ...        ...
795  719                Diancie     Rock   Fairy  50     100       True
796  719    DiancieMega Diancie     Rock   Fairy  50     160       True
797  720    HoopaHoopa Confined  Psychic   Ghost  80     110       True
798  720     HoopaHoopa Unbound  Psychic    Dark  80     160       True
799  721              Volcanion     Fire   Water  80     110       True

[800 rows x 7 columns]


In [28]:
print(df.to_string())

     Num                       Name     Type1     Type2   HP  Attack  Defense  SpAtk  SpDef  Speed  Generation  Legendary
0      1                  Bulbasaur     Grass    Poison   45      49       49     65     65     45           1      False
1      2                    Ivysaur     Grass    Poison   60      62       63     80     80     60           1      False
2      3                   Venusaur     Grass    Poison   80      82       83    100    100     80           1      False
3      3      VenusaurMega Venusaur     Grass    Poison   80     100      123    122    120     80           1      False
4      4                 Charmander      Fire       NaN   39      52       43     60     50     65           1      False
5      5                 Charmeleon      Fire       NaN   58      64       58     80     65     80           1      False
6      6                  Charizard      Fire    Flying   78      84       78    109     85    100           1      False
7      6  CharizardMega 

# Selection

In [31]:
import pandas as pd


print(df["Name"])
# The above line of code is printing the "Name" column from the DataFrame 'df'. It accesses the column using the column name as a key, which returns a Series containing the values of that column.
print()

print(df["Name"].to_string())
# The above line of code is printing the "Name" column from the DataFrame 'df' as a string. The to_string() method is used to convert the Series into a string representation, which allows for better formatting when printing.


0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
795                  Diancie
796      DiancieMega Diancie
797      HoopaHoopa Confined
798       HoopaHoopa Unbound
799                Volcanion
Name: Name, Length: 800, dtype: object

0                      Bulbasaur
1                        Ivysaur
2                       Venusaur
3          VenusaurMega Venusaur
4                     Charmander
5                     Charmeleon
6                      Charizard
7      CharizardMega Charizard X
8      CharizardMega Charizard Y
9                       Squirtle
10                     Wartortle
11                     Blastoise
12       BlastoiseMega Blastoise
13                      Caterpie
14                       Metapod
15                    Butterfree
16                        Weedle
17                        Kakuna
18                      Beedrill
19         Bee

In [None]:
print(df["Attack"].to_string())
# The above line of code is printing the "Attack" column from the DataFrame 'df' as a string. Similar to the previous line, it uses the to_string() method to convert the Series into a string representation for better formatting when printing.

0       49
1       62
2       82
3      100
4       52
5       64
6       84
7      130
8      104
9       48
10      63
11      83
12     103
13      30
14      20
15      45
16      35
17      25
18      90
19     150
20      45
21      60
22      80
23      80
24      56
25      81
26      60
27      90
28      60
29      85
30      55
31      90
32      75
33     100
34      47
35      62
36      92
37      57
38      72
39     102
40      45
41      70
42      41
43      76
44      45
45      70
46      45
47      80
48      50
49      65
50      80
51      70
52      95
53      55
54      65
55      55
56      80
57      45
58      70
59      52
60      82
61      80
62     105
63      70
64     110
65      50
66      65
67      95
68      20
69      35
70      50
71      50
72      80
73     100
74     130
75      75
76      90
77     105
78      40
79      70
80      80
81      95
82     120
83      85
84     100
85      65
86      75
87      75
88      35
89      60
90      65

In [None]:
print(df[["Name", "Attack", "HP"]].to_string())
# The above line of code is printing a subset of the DataFrame 'df' that includes only the "Name", "Attack", and "HP" columns. The to_string() method is used to convert this subset of the DataFrame into a string representation for better formatting when printing.

                          Name  Attack   HP
0                    Bulbasaur      49   45
1                      Ivysaur      62   60
2                     Venusaur      82   80
3        VenusaurMega Venusaur     100   80
4                   Charmander      52   39
5                   Charmeleon      64   58
6                    Charizard      84   78
7    CharizardMega Charizard X     130   78
8    CharizardMega Charizard Y     104   78
9                     Squirtle      48   44
10                   Wartortle      63   59
11                   Blastoise      83   79
12     BlastoiseMega Blastoise     103   79
13                    Caterpie      30   45
14                     Metapod      20   50
15                  Butterfree      45   60
16                      Weedle      35   40
17                      Kakuna      25   45
18                    Beedrill      90   65
19       BeedrillMega Beedrill     150   65
20                      Pidgey      45   40
21                   Pidgeotto  

In [35]:
# Selection by Rows

print(df.loc[0])
# The above line of code is printing the first row of the DataFrame 'df' using the loc indexer. It accesses the row at index 0, which corresponds to the first row of the DataFrame.
print()
print(df.loc[1])

Num                   1
Name          Bulbasaur
Type1             Grass
Type2            Poison
HP                   45
Attack               49
Defense              49
SpAtk                65
SpDef                65
Speed                45
Generation            1
Legendary         False
Name: 0, dtype: object

Num                 2
Name          Ivysaur
Type1           Grass
Type2          Poison
HP                 60
Attack             62
Defense            63
SpAtk              80
SpDef              80
Speed              60
Generation          1
Legendary       False
Name: 1, dtype: object


In [38]:
df = pd.read_csv("PokemonData.csv", index_col="Name")
# Reading the CSV file "PokemonData.csv" into a DataFrame called 'df' and setting the "Name" column as the index of the DataFrame using the index_col parameter. This allows for easier access to rows based on the Pokemon names.

print(df.loc["Pikachu"])
print()
print(df.loc["Bulbasaur"])


Num                 25
Type1         Electric
Type2              NaN
HP                  35
Attack              55
Defense             40
SpAtk               50
SpDef               50
Speed               90
Generation           1
Legendary        False
Name: Pikachu, dtype: object

Num                1
Type1          Grass
Type2         Poison
HP                45
Attack            49
Defense           49
SpAtk             65
SpDef             65
Speed             45
Generation         1
Legendary      False
Name: Bulbasaur, dtype: object


In [None]:
print(df.loc["Charizard", ["Speed", "Type1"]])
# The above line of code is printing the values of the "Speed" and "Type1" columns for the row corresponding to "Charizard" in the DataFrame 'df'. It uses the loc indexer to access the specific row and columns based on their labels.

print(df.loc["Charizard" : "Zapdos", ["Speed", "Type1"]])
# The above line of code is printing the values of the "Speed" and "Type1" columns for the rows corresponding to "Charizard" through "Zapdos" in the DataFrame 'df'. It uses the loc indexer to access a range of rows based on their labels and the specified columns.


Speed     100
Type1    Fire
Name: Charizard, dtype: object
                           Speed     Type1
Name                                      
Charizard                    100      Fire
CharizardMega Charizard X    100      Fire
CharizardMega Charizard Y    100      Fire
Squirtle                      43     Water
Wartortle                     58     Water
...                          ...       ...
Aerodactyl                   130      Rock
AerodactylMega Aerodactyl    150      Rock
Snorlax                       30    Normal
Articuno                      85       Ice
Zapdos                       100  Electric

[152 rows x 2 columns]


In [None]:
print(df.iloc[0:11])
# This prints the first 11 rows of the DataFrame 'df' using the iloc indexer, which accesses rows based on their integer location. The range 0:11 includes rows from index 0 to index 10 (inclusive).


                           Num  Type1   Type2  HP  Attack  Defense  SpAtk  \
Name                                                                        
Bulbasaur                    1  Grass  Poison  45      49       49     65   
Ivysaur                      2  Grass  Poison  60      62       63     80   
Venusaur                     3  Grass  Poison  80      82       83    100   
VenusaurMega Venusaur        3  Grass  Poison  80     100      123    122   
Charmander                   4   Fire     NaN  39      52       43     60   
Charmeleon                   5   Fire     NaN  58      64       58     80   
Charizard                    6   Fire  Flying  78      84       78    109   
CharizardMega Charizard X    6   Fire  Dragon  78     130      111    130   
CharizardMega Charizard Y    6   Fire  Flying  78     104       78    159   
Squirtle                     7  Water     NaN  44      48       65     50   
Wartortle                    8  Water     NaN  59      63       80     65   

In [43]:
print(df.iloc[0:11: 2]) 
# This prints every second row from the first 11 rows of the DataFrame 'df' using the iloc indexer. The range 0:11 includes rows from index 0 to index 10 (inclusive), and the step of 2 means it will print rows at indices 0, 2, 4, 6, and 8.

                           Num  Type1   Type2  HP  Attack  Defense  SpAtk  \
Name                                                                        
Bulbasaur                    1  Grass  Poison  45      49       49     65   
Venusaur                     3  Grass  Poison  80      82       83    100   
Charmander                   4   Fire     NaN  39      52       43     60   
Charizard                    6   Fire  Flying  78      84       78    109   
CharizardMega Charizard Y    6   Fire  Flying  78     104       78    159   
Wartortle                    8  Water     NaN  59      63       80     65   

                           SpDef  Speed  Generation  Legendary  
Name                                                            
Bulbasaur                     65     45           1      False  
Venusaur                     100     80           1      False  
Charmander                    50     65           1      False  
Charizard                     85    100           1      F

In [45]:
pokemon = input("Enter a pokemon name: ")

try:
    print(df.loc[pokemon])
    # The try block attempts to access and print the row corresponding to the user-inputted Pokemon name using the loc indexer. 
    # If the Pokemon name exists in the DataFrame, it will print the details of that Pokemon.

except KeyError:
    # If the Pokemon name does not exist in the DataFrame, a KeyError will be raised, and the except block will catch that error and execute the code within it.
    print(f"{pokemon} not found in the DataFrame.")



Num              143
Type1         Normal
Type2            NaN
HP               160
Attack           110
Defense           65
SpAtk             65
SpDef            110
Speed             30
Generation         1
Legendary      False
Name: Snorlax, dtype: object


# Filtering

In [53]:
import pandas as pd

df = pd.read_csv("PokemonData.csv")

# Filtering = Keeping he rows that match a condition

strong_pokemon = df[df["Attack"] > 180]
# The above line of code is creating a new DataFrame called 'strong_pokemon' that contains only the rows from the original DataFrame 'df' where the value in the "Attack" column is greater than 100. This is done using boolean indexing, where the condition df["Attack"] > 100 returns a boolean Series that is used to filter the rows of the DataFrame.

print(strong_pokemon)

     Num                     Name    Type1     Type2   HP  Attack  Defense  \
163  150      MewtwoMega Mewtwo X  Psychic  Fighting  106     190      100   
232  214  HeracrossMega Heracross      Bug  Fighting   80     185      115   

     SpAtk  SpDef  Speed  Generation  Legendary  
163    154    100    130           1       True  
232     40    105     75           2      False  


In [54]:
speedy_pokemon = df[df["Speed"] > 150]
# The above line of code is creating a new DataFrame called 'speedy_pokemon' that contains only the rows from the original DataFrame 'df' where the value in the "Speed" column is greater than 150. Similar to the previous line, it uses boolean indexing to filter the rows based on the specified condition.

print(speedy_pokemon)

     Num               Name    Type1   Type2  HP  Attack  Defense  SpAtk  \
315  291            Ninjask      Bug  Flying  61      90       45     50   
431  386  DeoxysSpeed Forme  Psychic     NaN  50      95       90     95   

     SpDef  Speed  Generation  Legendary  
315     50    160           3      False  
431     90    180           3       True  


In [76]:
f = df[df["HP"] == 1]
print(f)

     Num      Name Type1  Type2  HP  Attack  Defense  SpAtk  SpDef  Speed  \
316  292  Shedinja   Bug  Ghost   1      90       45     30     30     40   

     Generation  Legendary  
316           3      False  


In [58]:
legendary_pokemon = df[df["Legendary"] == True]
# This contains only the rows from the original DataFrame 'df' where the value in the "Legendary" column is True. This filters the DataFrame to include only the legendary Pokemon based on the boolean condition.
print(legendary_pokemon)

     Num                 Name     Type1     Type2   HP  Attack  Defense  \
156  144             Articuno       Ice    Flying   90      85      100   
157  145               Zapdos  Electric    Flying   90      90       85   
158  146              Moltres      Fire    Flying   90     100       90   
162  150               Mewtwo   Psychic       NaN  106     110       90   
163  150  MewtwoMega Mewtwo X   Psychic  Fighting  106     190      100   
..   ...                  ...       ...       ...  ...     ...      ...   
795  719              Diancie      Rock     Fairy   50     100      150   
796  719  DiancieMega Diancie      Rock     Fairy   50     160      110   
797  720  HoopaHoopa Confined   Psychic     Ghost   80     110       60   
798  720   HoopaHoopa Unbound   Psychic      Dark   80     160       60   
799  721            Volcanion      Fire     Water   80     110      120   

     SpAtk  SpDef  Speed  Generation  Legendary  
156     95    125     85           1       True  

In [59]:
water_pokemon = df[df["Type1"] == "Water"]
# This line of code is creating a new DataFrame called 'water_pokemon' that contains only the rows where the value in the "Type1" column is "Water". This filters the DataFrame to include only the Pokemon that have "Water" as their primary type.
print(water_pokemon)

     Num                     Name  Type1 Type2  HP  Attack  Defense  SpAtk  \
9      7                 Squirtle  Water   NaN  44      48       65     50   
10     8                Wartortle  Water   NaN  59      63       80     65   
11     9                Blastoise  Water   NaN  79      83      100     85   
12     9  BlastoiseMega Blastoise  Water   NaN  79     103      120    135   
59    54                  Psyduck  Water   NaN  50      52       48     65   
..   ...                      ...    ...   ...  ..     ...      ...    ...   
724  656                  Froakie  Water   NaN  41      56       40     62   
725  657                Frogadier  Water   NaN  54      63       52     83   
726  658                 Greninja  Water  Dark  72      95       67    103   
762  692                Clauncher  Water   NaN  50      53       62     58   
763  693                Clawitzer  Water   NaN  71      73       88    120   

     SpDef  Speed  Generation  Legendary  
9       64     43   

In [None]:
fire_pokemon = df[(df["Type1"] == "Fire") & (df["Type2"] == "Flying")]
# fire_pokemon is a new DataFrame that contains only the rows from the original DataFrame 'df' where the value in the "Type1" column is "Fire" and the value in the "Type2" column is "Flying".
# This filters the DataFrame to include only the Pokemon that have "Fire" as their primary type and "Flying" as their secondary type.
print(fire_pokemon)

     Num                       Name Type1   Type2   HP  Attack  Defense  \
6      6                  Charizard  Fire  Flying   78      84       78   
8      6  CharizardMega Charizard Y  Fire  Flying   78     104       78   
158  146                    Moltres  Fire  Flying   90     100       90   
270  250                      Ho-oh  Fire  Flying  106     130       90   
730  662                Fletchinder  Fire  Flying   62      73       55   
731  663                 Talonflame  Fire  Flying   78      81       71   

     SpAtk  SpDef  Speed  Generation  Legendary  
6      109     85    100           1      False  
8      159    115    100           1      False  
158    125     85     90           1       True  
270    110    154     90           2       True  
730     56     52     84           6      False  
731     74     69    126           6      False  


# Aggregation

In [64]:
# aggregate funtions = Reduces a set of values down to a single value
#                      Used to summarize and analyze data
#                      Often used with the groupby() function


In [None]:
import pandas as pd

df = pd.read_csv("PokemonData.csv")

print(df.mean())
# This causes an error because the mean() function cannot be applied to non-numeric columns in the DataFrame. The DataFrame likely contains columns with string or categorical data, which cannot be averaged. 
# To avoid this error, you can select only the numeric columns before applying the mean() function, or you can use the mean() function on specific numeric columns instead of the entire DataFrame.

TypeError: can only concatenate str (not "int") to str

In [66]:
print(df.mean(numeric_only=True)) 
# This line of code calculates the mean of only the numeric columns in the DataFrame 'df' by setting the parameter numeric_only to True. 
#This allows you to get the average values for the numeric columns without encountering errors from non-numeric data.


Num           362.81375
HP             69.25875
Attack         79.00125
Defense        73.84250
SpAtk          72.82000
SpDef          71.90250
Speed          68.27750
Generation      3.32375
Legendary       0.08125
dtype: float64


In [67]:
print(df.sum(numeric_only=True))
# This line of code calculates the sum of only the numeric columns in the DataFrame 'df' by setting the parameter numeric_only to True. 
#This allows you to get the total values for the numeric columns without encountering errors from non-numeric data

Num           290251
HP             55407
Attack         63201
Defense        59074
SpAtk          58256
SpDef          57522
Speed          54622
Generation      2659
Legendary         65
dtype: int64


In [None]:
print(df.min(numeric_only=True)) 
# This line of code calculates the minimum values of only the numeric columns in the DataFrame 'df' by setting the parameter numeric_only to True. 
# This allows you to get the minimum values for the numeric columns without encountering errors from non-n

Num               1
HP                1
Attack            5
Defense           5
SpAtk            10
SpDef            20
Speed             5
Generation        1
Legendary     False
dtype: object


In [69]:
print(df.max(numeric_only=True))
# This line of code calculates the maximum values of only the numeric columns in the DataFrame 'df' by setting the parameter numeric_only to True. 
# This allows you to get the maximum values for the numeric columns without encountering errors from non-n

Num            721
HP             255
Attack         190
Defense        230
SpAtk          194
SpDef          230
Speed          180
Generation       6
Legendary     True
dtype: object


In [None]:
print(df.count(numeric_only=True))
# This line of code counts the number of non-null values in only the numeric columns of the DataFrame 'df' by setting the parameter numeric_only to True. 
# This allows you to get the count of non-null values for the numeric columns without encountering errors

Num           800
HP            800
Attack        800
Defense       800
SpAtk         800
SpDef         800
Speed         800
Generation    800
Legendary     800
dtype: int64


In [71]:
# single column
print(df["Attack"].mean())

79.00125


In [72]:
print(df["Speed"].sum())

54622


In [73]:
print(df["HP"].min())

1


In [77]:
print(df["Speed"].max())

40


In [None]:
print(df["Legendary"].count()) 
# This line of code counts the number of non-null values in the "Legendary" column of the DataFrame 'df' using the count() method. 
# It returns the total count of non-null entries in that specific column.

1


In [None]:
group = df.groupby("Type1")
# This line of code is grouping the DataFrame 'df' by the values in the "Type1" column using the groupby() method. 
# This creates a GroupBy object that allows you to perform aggregate functions on the groups of data
print(group["Speed"].mean()) #This calculates the mean of the "Speed" column for each group of Pokemon based on their primary type ("Type1").
print(group["Attack"].min()) # This calculates the minimum value of the "Attack" column for each group of Pokemon based on their primary type ("Type1").
print(group["HP"].max()) # This calculates the maximum value of the "HP" column for each group of Pokemon based on their primary type ("Type1").
print(group["Speed"].count()) # This calculates the count of non-null values in the "Speed" column for each group of Pokemon based on their primary type ("Type1").

Type1
Bug    40.0
Name: Speed, dtype: float64
Type1
Bug    90
Name: Attack, dtype: int64
Type1
Bug    1
Name: HP, dtype: int64
Type1
Bug    1
Name: Speed, dtype: int64


# Data Cleaning

In [89]:
# Data Cleaning = the process of fixing or removing:
#                 incomplete data, or irrelavant data, or inaccurate data, or improperly formatted data
#                 ~75% of work done with pandas is data cleaning

import pandas as pd

df = pd.read_csv("PokemonData.csv")

# Drop the irrelavent columns
df = df.drop(columns=["SpDef", "SpAtk", "Speed", "Generation", "Defense"])
print(df)

     Num                   Name    Type1   Type2  HP  Attack  Legendary
0      1              Bulbasaur    Grass  Poison  45      49      False
1      2                Ivysaur    Grass  Poison  60      62      False
2      3               Venusaur    Grass  Poison  80      82      False
3      3  VenusaurMega Venusaur    Grass  Poison  80     100      False
4      4             Charmander     Fire     NaN  39      52      False
..   ...                    ...      ...     ...  ..     ...        ...
795  719                Diancie     Rock   Fairy  50     100       True
796  719    DiancieMega Diancie     Rock   Fairy  50     160       True
797  720    HoopaHoopa Confined  Psychic   Ghost  80     110       True
798  720     HoopaHoopa Unbound  Psychic    Dark  80     160       True
799  721              Volcanion     Fire   Water  80     110       True

[800 rows x 7 columns]


In [None]:
# Handle missing data

df = df.dropna(subset=["Type2"]) # This line of code is dropping any rows from the DataFrame 'df' that have missing (NaN) values in the "Type2" column.
# The dropna() method is used to remove rows with missing data, and the subset parameter
# specifies that only rows with missing values in the "Type2" column should be dropped. This helps to clean the data by removing incomplete entries that may affect analysis or modeling.
print(df.to_string())

     Num                       Name     Type1     Type2   HP  Attack  Legendary
0      1                  Bulbasaur     Grass    Poison   45      49      False
1      2                    Ivysaur     Grass    Poison   60      62      False
2      3                   Venusaur     Grass    Poison   80      82      False
3      3      VenusaurMega Venusaur     Grass    Poison   80     100      False
6      6                  Charizard      Fire    Flying   78      84      False
7      6  CharizardMega Charizard X      Fire    Dragon   78     130      False
8      6  CharizardMega Charizard Y      Fire    Flying   78     104      False
15    12                 Butterfree       Bug    Flying   60      45      False
16    13                     Weedle       Bug    Poison   40      35      False
17    14                     Kakuna       Bug    Poison   45      25      False
18    15                   Beedrill       Bug    Poison   65      90      False
19    15      BeedrillMega Beedrill     

In [91]:
df = df.fillna({"Type2": "None"})
# This line of code is filling any missing (NaN) values in the "Type2" column of the DataFrame 'df' with the string "None".
# The fillna() method is used to replace missing values with a specified value, and in
print(df.to_string())

     Num                       Name     Type1     Type2   HP  Attack  Legendary
0      1                  Bulbasaur     Grass    Poison   45      49      False
1      2                    Ivysaur     Grass    Poison   60      62      False
2      3                   Venusaur     Grass    Poison   80      82      False
3      3      VenusaurMega Venusaur     Grass    Poison   80     100      False
6      6                  Charizard      Fire    Flying   78      84      False
7      6  CharizardMega Charizard X      Fire    Dragon   78     130      False
8      6  CharizardMega Charizard Y      Fire    Flying   78     104      False
15    12                 Butterfree       Bug    Flying   60      45      False
16    13                     Weedle       Bug    Poison   40      35      False
17    14                     Kakuna       Bug    Poison   45      25      False
18    15                   Beedrill       Bug    Poison   65      90      False
19    15      BeedrillMega Beedrill     

In [94]:
# Fix inconsistent data
df["Type1"] = df["Type1"].replace({"Ground": "GROUND",
                                   "Fire": "FIRE",
                                   "Water": "WATER"})
# This line of code is replacing any occurrences of the string "Ground" with "GROUND" in the "Type1" column of the DataFrame 'df' using the replace() method.
# This is done to fix inconsistent data, ensuring that all entries for the "Ground" type
print(df.to_string())

     Num                       Name     Type1     Type2   HP  Attack  Legendary
0      1                  Bulbasaur     Grass    Poison   45      49      False
1      2                    Ivysaur     Grass    Poison   60      62      False
2      3                   Venusaur     Grass    Poison   80      82      False
3      3      VenusaurMega Venusaur     Grass    Poison   80     100      False
6      6                  Charizard      FIRE    Flying   78      84      False
7      6  CharizardMega Charizard X      FIRE    Dragon   78     130      False
8      6  CharizardMega Charizard Y      FIRE    Flying   78     104      False
15    12                 Butterfree       Bug    Flying   60      45      False
16    13                     Weedle       Bug    Poison   40      35      False
17    14                     Kakuna       Bug    Poison   45      25      False
18    15                   Beedrill       Bug    Poison   65      90      False
19    15      BeedrillMega Beedrill     

In [96]:
# 4 Standardize text

df["Name"] = df["Name"].str.upper()
# This line of code is converting all the text in the "Name" column of the Data
print(df.to_string())

     Num                       Name     Type1     Type2   HP  Attack  Legendary
0      1                  BULBASAUR     Grass    Poison   45      49      False
1      2                    IVYSAUR     Grass    Poison   60      62      False
2      3                   VENUSAUR     Grass    Poison   80      82      False
3      3      VENUSAURMEGA VENUSAUR     Grass    Poison   80     100      False
6      6                  CHARIZARD      FIRE    Flying   78      84      False
7      6  CHARIZARDMEGA CHARIZARD X      FIRE    Dragon   78     130      False
8      6  CHARIZARDMEGA CHARIZARD Y      FIRE    Flying   78     104      False
15    12                 BUTTERFREE       Bug    Flying   60      45      False
16    13                     WEEDLE       Bug    Poison   40      35      False
17    14                     KAKUNA       Bug    Poison   45      25      False
18    15                   BEEDRILL       Bug    Poison   65      90      False
19    15      BEEDRILLMEGA BEEDRILL     

In [97]:
# 5 Fix data types

df["Legendary"] = df["Legendary"].astype(int)
# This line of code is converting the data type of the "Legendary" column in the DataFrame 'df' to integers using the astype() method.
print(df.to_string())

     Num                       Name     Type1     Type2   HP  Attack  Legendary
0      1                  BULBASAUR     Grass    Poison   45      49          0
1      2                    IVYSAUR     Grass    Poison   60      62          0
2      3                   VENUSAUR     Grass    Poison   80      82          0
3      3      VENUSAURMEGA VENUSAUR     Grass    Poison   80     100          0
6      6                  CHARIZARD      FIRE    Flying   78      84          0
7      6  CHARIZARDMEGA CHARIZARD X      FIRE    Dragon   78     130          0
8      6  CHARIZARDMEGA CHARIZARD Y      FIRE    Flying   78     104          0
15    12                 BUTTERFREE       Bug    Flying   60      45          0
16    13                     WEEDLE       Bug    Poison   40      35          0
17    14                     KAKUNA       Bug    Poison   45      25          0
18    15                   BEEDRILL       Bug    Poison   65      90          0
19    15      BEEDRILLMEGA BEEDRILL     

In [100]:
df = pd.read_csv("PokemonData.csv")
print(df)

     Num                Name    Type1   Type2  HP  Attack  Defense  SpAtk  \
0      1           Bulbasaur    Grass  Poison  45      49       49     65   
1      1           Bulbasaur    Grass  Poison  45      49       49     65   
2      1           Bulbasaur    Grass  Poison  45      49       49     65   
3      1           Bulbasaur    Grass  Poison  45      49       49     65   
4      1           Bulbasaur    Grass  Poison  45      49       49     65   
..   ...                 ...      ...     ...  ..     ...      ...    ...   
804  720  HoopaHoopa Unbound  Psychic    Dark  80     160       60    170   
805  720  HoopaHoopa Unbound  Psychic    Dark  80     160       60    170   
806  720  HoopaHoopa Unbound  Psychic    Dark  80     160       60    170   
807  720  HoopaHoopa Unbound  Psychic    Dark  80     160       60    170   
808  721           Volcanion     Fire   Water  80     110      120    130   

     SpDef  Speed  Generation  Legendary  
0       65     45           1   

In [None]:
# 6 Remove duplicate values
df = df.drop_duplicates()
# This line of code is removing any duplicate rows from the DataFrame 'df' using the    
print(df.to_string())

     Num                       Name     Type1     Type2   HP  Attack  Defense  SpAtk  SpDef  Speed  Generation  Legendary
0      1                  Bulbasaur     Grass    Poison   45      49       49     65     65     45           1      False
5      2                    Ivysaur     Grass    Poison   60      62       63     80     80     60           1      False
6      3                   Venusaur     Grass    Poison   80      82       83    100    100     80           1      False
7      3      VenusaurMega Venusaur     Grass    Poison   80     100      123    122    120     80           1      False
8      4                 Charmander      Fire       NaN   39      52       43     60     50     65           1      False
9      5                 Charmeleon      Fire       NaN   58      64       58     80     65     80           1      False
10     6                  Charizard      Fire    Flying   78      84       78    109     85    100           1      False
11     6  CharizardMega 