In [1]:
%pip install pdfplumber

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import requests
import pdfplumber
from io import BytesIO
import re
import pandas as pd



In [3]:
#Function to scrape stocking report pdf
def extract_stocking_data_from_url(pdf_url):
    # Step 1: Download the PDF from the URL
    response = requests.get(pdf_url)
    
    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to retrieve PDF.")
        return None
    
    # Step 2: Open the PDF from the downloaded bytes using pdfplumber
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        text = ""
        # Extract text from each page of the PDF
        for page in pdf.pages:
            text += page.extract_text()

    # Step 3: Process the text and extract the relevant data
    lines = text.split('\n')
    
    # Locate the "Year to Date STOCKING REPORT" and start extracting data from there
    start_index = None
    for i, line in enumerate(lines):
        if "Year to Date STOCKING REPORT" in line:
            start_index = i + 1  # Start after the header line
            break
    
    if start_index is None:
        print("Failed to find the report start line.")
        return None
    
    # Extract rows
    data = []
    for line in lines[start_index:]:
        # Adjusted regex to account for potential variations in spacing, formatting, parentheses, and numbers
        match = re.match(r'(\d{1,2}/\d{1,2}/\d{4})\s+([A-Za-z\s\.()0-9#\'-]+?)\s+(\d+)\s+(\d+)', line.strip())
        if match:
            date, city, qty, size = match.groups()
            # Check for specific fish species names and adjust accordingly
            city = city.strip()
            if "L.L. SALMON" in city:
                species = "L.L. SALMON"
                city = city.replace("L.L. SALMON", "").strip()
            elif "SPLAKE" in city:
                species = "SPLAKE"
                city = city.replace("SPLAKE", "").strip()
            elif "BROOK TROUT" in city:
                species = "BROOK TROUT"
                city = city.replace("BROOK TROUT", "").strip()
            elif "LAKE TROUT" in city:
                species = "LAKE TROUT"
                city = city.replace("LAKE TROUT", "").strip()
            elif "RAINBOW TROUT" in city:
                species = "RAINBOW TROUT"
                city = city.replace("RAINBOW TROUT", "").strip()
            elif "BROWN TROUT" in city:
                species = "BROWN TROUT"
                city = city.replace("BROWN TROUT", "").strip()
            else:
                species = "Unknown"
            
            # Append the data to the list
            data.append([date, city, int(qty), int(size), species])
        else:
            # Optionally print lines that don't match to debug
            print(f"Skipped line (doesn't match): {line}")
    
    # Step 4: Create a DataFrame
    df = pd.DataFrame(data, columns=["Date", "City/Town", "Qty", "Size (inch)", "Fish"])
    
    return df

# PDF URL
pdf_url = 'https://www.maine.gov/ifw/docs/2024%20Annual%20Fish%20Stocking%20Report.pdf'

# Extract the data and show it as a DataFrame
df = extract_stocking_data_from_url(pdf_url)
if df is not None:
    print(df.head())  # Display the first few rows of the DataFrame



Skipped line (doesn't match): Androscoggin County
Skipped line (doesn't match): DATE WATER City/Town SPECIES QTY SIZE (inch)
Skipped line (doesn't match): Page 1 of 43 Annual Stocking Receipt Data 1/2/20252024 Year to Date STOCKING REPORT
Skipped line (doesn't match): Androscoggin County
Skipped line (doesn't match): DATE WATER City/Town SPECIES QTY SIZE (inch)
Skipped line (doesn't match): End of : Androscoggin County
Skipped line (doesn't match): Page 2 of 43 Annual Stocking Receipt Data 1/2/20252024 Year to Date STOCKING REPORT
Skipped line (doesn't match): Aroostook County
Skipped line (doesn't match): DATE WATER City/Town SPECIES QTY SIZE (inch)
Skipped line (doesn't match): Page 3 of 43 Annual Stocking Receipt Data 1/2/20252024 Year to Date STOCKING REPORT
Skipped line (doesn't match): Aroostook County
Skipped line (doesn't match): DATE WATER City/Town SPECIES QTY SIZE (inch)
Skipped line (doesn't match): End of : Aroostook County
Skipped line (doesn't match): Page 4 of 43 Annual

In [4]:
#df shape
df.shape

(1799, 5)

In [5]:
#Print df
print(df)

            Date                               City/Town  Qty  Size (inch)  \
0      4/18/2024                   ANDROSCOGGIN R LISBON  300           10   
1       5/7/2024                   ANDROSCOGGIN R LISBON  200           10   
2      4/25/2024          ANDROSCOGGIN R (LITTLE) AUBURN  515            9   
3      4/25/2024  ANDROSCOGGIN R (LITTLE) MECHANIC FALLS  350            9   
4      4/25/2024           ANDROSCOGGIN R (LITTLE) MINOT  300            9   
...          ...                                     ...  ...          ...   
1794  11/28/2024                          WILSON L ACTON   17           18   
1795   4/10/2024                       WORSTER B BERWICK  200           10   
1796    5/3/2024                       WORSTER B BERWICK  200           10   
1797  10/16/2024                            YORK P ELIOT  200           12   
1798    5/7/2024                            YORK R ELIOT  200           10   

             Fish  
0     BROOK TROUT  
1     BROOK TROUT  
2  

In [6]:
#Check for "unknown"
print(df[df["Fish"]=="Unknown"])

Empty DataFrame
Columns: [Date, City/Town, Qty, Size (inch), Fish]
Index: []


In [7]:
# Define the function to separate bodies of water and towns
def separate_water_and_town(city_town):
    # Regular expression to handle cases like 'R (LITTLE) AUBURN', 'L ACTON', etc.
    # Water body (R, L, P, B) comes before the town name
    pattern = r'^(.*?)(\s+(P|L|R|B)\s?[\w\s\(\)-]+(?:\s*\d*)?$)'

    # Search for the body of water using regex
    match = re.search(pattern, city_town)
    
    if match:
        # Correctly assign the parts, ensure the city comes after the water body
        town = match.group(1).strip()  # This should be the town name
        water = match.group(2).strip()  # This should be the body of water part
        
        return town, water
    else:
        # If no match, return the whole input as the town and None for water
        return city_town, None

# Apply the function to separate the columns
df[['Water', 'City']] = df['City/Town'].apply(lambda x: pd.Series(separate_water_and_town(x)))

# Drop the old 'City/Town' column and reorder the columns to retain all data
df = df.drop(columns=['City/Town'])

# Reorder the columns
df = df[['Date', 'Water', 'City', 'Qty', 'Size (inch)', 'Fish']]

# Print the final DataFrame
print(df)

            Date         Water                       City  Qty  Size (inch)  \
0      4/18/2024  ANDROSCOGGIN                   R LISBON  300           10   
1       5/7/2024  ANDROSCOGGIN                   R LISBON  200           10   
2      4/25/2024  ANDROSCOGGIN          R (LITTLE) AUBURN  515            9   
3      4/25/2024  ANDROSCOGGIN  R (LITTLE) MECHANIC FALLS  350            9   
4      4/25/2024  ANDROSCOGGIN           R (LITTLE) MINOT  300            9   
...          ...           ...                        ...  ...          ...   
1794  11/28/2024        WILSON                    L ACTON   17           18   
1795   4/10/2024       WORSTER                  B BERWICK  200           10   
1796    5/3/2024       WORSTER                  B BERWICK  200           10   
1797  10/16/2024          YORK                    P ELIOT  200           12   
1798    5/7/2024          YORK                    R ELIOT  200           10   

             Fish  
0     BROOK TROUT  
1     BROOK

In [8]:
# Define the function to move the water body abbreviation (R, L, B, P) from the 'City' column to the 'Water' column
def move_water_abbreviation_to_water_column(row):
    # Ensure that the City column is a string before proceeding
    city_value = str(row['City'])

    # Regular expression to detect the water body abbreviation at the start of the City name (R, L, B, P)
    pattern = r'^(R|L|B|P)\s+'

    # Check if the pattern is at the beginning of the City name
    match = re.match(pattern, city_value)

    if match:
        # Move the abbreviation (and the water body) to the 'Water' column
        water_body = match.group(1)  # Capture the water abbreviation
        water = row['Water'] + ' ' + water_body  # Append it to the existing water column value
        city = city_value[match.end():].strip()  # Remove the water abbreviation from the City name
        
        # Update the row with the new values
        return pd.Series([city, water], index=['City', 'Water'])
    else:
        # If no match, return the row as it is
        return row[['City', 'Water']]

# Assuming you have your dataframe already
# Apply the function row by row
df[['City', 'Water']] = df.apply(move_water_abbreviation_to_water_column, axis=1)

# Print the resulting dataframe
print(df)

            Date           Water                     City  Qty  Size (inch)  \
0      4/18/2024  ANDROSCOGGIN R                   LISBON  300           10   
1       5/7/2024  ANDROSCOGGIN R                   LISBON  200           10   
2      4/25/2024  ANDROSCOGGIN R          (LITTLE) AUBURN  515            9   
3      4/25/2024  ANDROSCOGGIN R  (LITTLE) MECHANIC FALLS  350            9   
4      4/25/2024  ANDROSCOGGIN R           (LITTLE) MINOT  300            9   
...          ...             ...                      ...  ...          ...   
1794  11/28/2024        WILSON L                    ACTON   17           18   
1795   4/10/2024       WORSTER B                  BERWICK  200           10   
1796    5/3/2024       WORSTER B                  BERWICK  200           10   
1797  10/16/2024          YORK P                    ELIOT  200           12   
1798    5/7/2024          YORK R                    ELIOT  200           10   

             Fish  
0     BROOK TROUT  
1     BROOK

In [9]:
df_side = df

In [10]:
# Define the function to move parentheses or hashtag numbers from 'City' to 'Water'
def move_parentheses_or_hashtags_to_water_column(row):
    # Ensure that the City column is a string before proceeding
    city_value = str(row['City'])

    # Regular expression to detect parentheses or hashtag numbers at the start (e.g., "(LITTLE)" or "#1")
    pattern = r'^[\(\#][\w\s]+[\)\#]\s*'  # Matches (anything) or #number at the start of the City column

    # Check if the pattern matches the start of the City value
    match = re.match(pattern, city_value)

    if match:
        # Extract the matched content (e.g., (LITTLE) or #1)
        parentheses_or_hashtag = match.group(0).strip()

        # Update the Water column by appending the parentheses or hashtag content
        water = row['Water'] + ' ' + parentheses_or_hashtag

        # Remove the matched parentheses or hashtag content from the City name
        city = city_value[match.end():].strip()  # Remove the matched part from the City column
        
        # Update the row with the new values
        return pd.Series([city, water], index=['City', 'Water'])
    else:
        # If no match, return the row as it is
        return row[['City', 'Water']]

# Assuming you have your dataframe already
# Apply the function row by row
df[['City', 'Water']] = df.apply(move_parentheses_or_hashtags_to_water_column, axis=1)

# Print the resulting dataframe
print(df)



            Date                    Water            City  Qty  Size (inch)  \
0      4/18/2024           ANDROSCOGGIN R          LISBON  300           10   
1       5/7/2024           ANDROSCOGGIN R          LISBON  200           10   
2      4/25/2024  ANDROSCOGGIN R (LITTLE)          AUBURN  515            9   
3      4/25/2024  ANDROSCOGGIN R (LITTLE)  MECHANIC FALLS  350            9   
4      4/25/2024  ANDROSCOGGIN R (LITTLE)           MINOT  300            9   
...          ...                      ...             ...  ...          ...   
1794  11/28/2024                 WILSON L           ACTON   17           18   
1795   4/10/2024                WORSTER B         BERWICK  200           10   
1796    5/3/2024                WORSTER B         BERWICK  200           10   
1797  10/16/2024                   YORK P           ELIOT  200           12   
1798    5/7/2024                   YORK R           ELIOT  200           10   

             Fish  
0     BROOK TROUT  
1     BROOK

In [11]:
#manual check for errors in city name
df["City"].unique()

array(['LISBON', 'AUBURN', 'MECHANIC FALLS', 'MINOT', 'LIVERMORE',
       'TURNER', 'DURHAM', 'PARK P AUBURN', 'POLAND', 'LEWISTON',
       'BROOK L PRESQUE ISLE', 'T19 R11 WELS', 'T14 R9 WELS',
       'SAINT JOHN PLT', 'T15 R9 WELS', 'HOULTON', 'LITTLETON',
       'MONTICELLO', 'GRAND ISLE', 'T16 R5 WELS', 'NEW CANADA', 'WESTON',
       'EAGLE LAKE', 'LINNEUS', 'LIMESTONE', 'PRESQUE ISLE',
       'RIVER FORT KENT', 'T17 R12 WELS', 'T18 R10 WELS', 'ORIENT',
       'MORO PLT', 'REED PLT', 'BROOK L MAPLETON', None, 'T14 R8 WELS',
       'T17 R4 WELS', 'T16 R4 WELS', 'T4 R3 WELS', 'HAYNESVILLE',
       'OAKFIELD', 'FORKSTOWN TWP', 'T11 R10 WELS', 'TC R2 WELS',
       'FORT KENT', 'MOLUNKUS TWP', 'FORT FAIRFIELD', 'NEW LIMERICK',
       'T9 R3 WELS', 'ISLAND FALLS', 'PORTAGE LAKE', 'T19 R12 WELS',
       'T11 R8 WELS', 'T11 R4 WELS', 'SQUAPAN TWP',
       'BROOK L (THIRD) NEW CANADA', 'HODGDON', 'DYER BROOK',
       'T12 R14 WELS', 'T7 R5 WELS', 'BRIDGTON', 'GORHAM', 'SEBAGO',
       'WEST

In [12]:
#Checking sandy river plt rows
df[df["City"] == "RIVER P (MIDDLE) SANDY RIVER PLT"]

Unnamed: 0,Date,Water,City,Qty,Size (inch),Fish
443,5/14/2024,SANDY,RIVER P (MIDDLE) SANDY RIVER PLT,600,10,BROOK TROUT
444,5/14/2024,SANDY,RIVER P (MIDDLE) SANDY RIVER PLT,200,10,BROOK TROUT
445,10/22/2024,SANDY,RIVER P (MIDDLE) SANDY RIVER PLT,300,13,BROOK TROUT


In [13]:
#values for later cleaning
fixlater = ['P) BELGRADE BROOK TROUT', 'P) BELGRADE BROWN TROUT', 'BPP BROOK TROUT']

In [14]:
#values to fix now
fixnow= ['PARK P AUBURN','BROOK L PRESQUE ISLE', 'RIVER FORT KENT', 'BROOK L MAPLETON', 'PORTAGE LAKE', 'BROOK L (THIRD) NEW CANADA', 'RAIN P NAPLES', 'BLUE P AVON', 'BLUE HIGH SCHOOL P FARMINGTON', 'POND TOWNSHIP 6 NORTH OF', 'RIVER P (MIDDLE) SANDY RIVER PLT', 'RIVER P TOWNSHIP E', 'LAND L T35 MD', 'PISTOL L T3 ND', 'RIVER L T10 SD',  'LAKES TOWNSHIP C', 'LAKES RICHARDSONTOWN TWP', 'PIT P MOOSEHEAD JUNCTION', 'BOG SHAWTOWN TWP', 'RIVER T3 R4 BKP WKR', 'REC (PAL) P FAIRFIELD', 'ROCK P PIERCE POND TWP', 'BERTH P DENNISTOWN PLT', 'RIVER BELFAST', 'RIVER SWANVILLE','BAY STATE PARK P EDMUNDS TWP', 'LAKE STREAM GRAND LAKE STREAM P', 'LAKE STREAM CANAL GRAND LAKE STREAM P', 'PUG L T26 ED BPP', 'LEDGE) P CHARLOTTE', 'BROOK P (EAST) T18 MD BPP', 'BROOK P (WEST) T18 MD BPP', 'RIVER L BEDDINGTON', 'BARREL L T6 R1 NBPP', 'POND B DAYTON',]

In [15]:
# Filter the dataframe where "City" is in the "fixnow" list
filtered_df = df[df["City"].isin(fixnow)]

# Select the "City" and "Water" columns together
paired_values = filtered_df[["Water", "City"]]

# Display the paired values
print(paired_values.to_string())

                             Water                                   City
47                      PETTINGILL                          PARK P AUBURN
85                          ARNOLD                   BROOK L PRESQUE ISLE
86                          ARNOLD                   BROOK L PRESQUE ISLE
110                           FISH                        RIVER FORT KENT
111                           FISH                        RIVER FORT KENT
122                         HANSON                       BROOK L MAPLETON
155   PORTAGE HILLS COUNTRY CLUB P                           PORTAGE LAKE
165                            SLY             BROOK L (THIRD) NEW CANADA
197                           COLD                          RAIN P NAPLES
198                           COLD                          RAIN P NAPLES
199                           COLD                          RAIN P NAPLES
398                          MOUNT                            BLUE P AVON
399                          MOUNT    

In [16]:
#Dict for manual changes
changes_dict = {
    'PARK P AUBURN': {'Water': 'PETTINGILL PARK P', 'City': 'AUBURN'},
    'BROOK L PRESQUE ISLE': {'Water': 'ARNOLD BROOK L', 'City': 'PRESQUE ISLE'},
    'RIVER FORT KENT': {'Water': 'FISH RIVER', 'City': 'FORT KENT'},
    'BROOK L MAPLETON': {'Water': 'HANSON BROOK L', 'City': 'MAPLETON'},
    'PORTAGE LAKE': {'Water': 'PORTAGE HILLS COUNTRY CLUB P', 'City': 'PORTAGE LAKE'},
    'BROOK L (THIRD) NEW CANADA': {'Water': 'SLY BROOK L (THIRD)', 'City': 'NEW CANADA'},
    'RAIN P NAPLES': {'Water': 'COLD RAIN P', 'City': 'NAPLES'},
    'BLUE P AVON': {'Water': 'MOUNT BLUE P', 'City': 'AVON'},
    'BLUE HIGH SCHOOL P FARMINGTON': {'Water': 'MT BLUE HIGH SCHOOL P', 'City': 'FARMINGTON'},
    'POND TOWNSHIP 6 NORTH OF': {'Water': 'MUD POND', 'City': 'TOWNSHIP 6 NORTH OF'},
    'RIVER P (MIDDLE) SANDY RIVER PLT': {'Water': 'SANDY RIVER P (MIDDLE)', 'City': 'SANDY RIVER PLT'},
    'RIVER P TOWNSHIP E': {'Water': 'SWIFT RIVER P', 'City': 'TOWNSHIP E'},
    'LAND L T35 MD': {'Water': 'BURNT LAND L', 'City': 'T35 MD'},
    'PISTOL L T3 ND': {'Water': 'SIDE PISTOL L', 'City': 'T3 ND'},
    'RIVER L T10 SD': {'Water': 'SPRING RIVER L', 'City': 'T10 SD'},
    'LAKES TOWNSHIP C': {'Water': 'RICHARDSON LAKES', 'City': 'TOWNSHIP C'},
    'LAKES RICHARDSONTOWN TWP': {'Water': 'RICHARDSON LAKES', 'City': 'RICHARDSONTOWN TWP'},
    'PIT P MOOSEHEAD JUNCTION': {'Water': 'GRAVEL PIT P', 'City': 'MOOSEHEAD JUNCTION'},
    'RIVER T3 R4 BKP WKR': {'Water': 'DEAD RIVER', 'City': 'T3 R4 BKP WKR'},
    'REC (PAL) P FAIRFIELD': {'Water': 'FAIRFIELD REC (PAL) P', 'City': 'FAIRFIELD'},
    'ROCK P PIERCE POND TWP': {'Water': 'SPLIT ROCK P PIERCE POND', 'City': 'TWP'},
    'BERTH P DENNISTOWN PLT': {'Water': 'SUGAR BERTH P', 'City': 'DENNISTOWN PLT'},
    'RIVER BELFAST': {'Water': 'GOOSE RIVER', 'City': 'BELFAST'},
    'RIVER SWANVILLE': {'Water': 'GOOSE RIVER', 'City': 'SWANVILLE'},
    'BAY STATE PARK P EDMUNDS TWP': {'Water': 'COBSCOOK BAY STATE PARK P', 'City': 'EDMUNDS TWP'},
    'LAKE STREAM GRAND LAKE STREAM P': {'Water': 'GRAND LAKE STREAM', 'City': 'GRAND LAKE STREAM P'},
    'LAKE STREAM CANAL GRAND LAKE STREAM P': {'Water': 'GRAND LAKE STREAM CANAL', 'City': 'GRAND LAKE STREAM P'},
    'PUG L T26 ED BPP': {'Water': 'HOSEA PUG L T26 ED BPP', 'City': 'T26 ED BPP'},
    'LEDGE) P CHARLOTTE': {'Water': 'LEDGE (BALD LEDGE) P CHARLOTTE', 'City': 'CHARLOTTE'},
    'BROOK P (EAST) T18 MD BPP': {'Water': 'PIKE BROOK P (EAST)', 'City': 'T18 MD BPP'},
    'BROOK P (WEST) T18 MD BPP': {'Water': 'PIKE BROOK P (WEST)', 'City': 'T18 MD BPP'},
    'RIVER L BEDDINGTON': {'Water': 'PLEASANT RIVER L', 'City': 'BEDDINGTON'},
    'BARREL L T6 R1 NBPP': {'Water': 'PORK BARREL L', 'City': 'T6 R1 NBPP'},
    'POND B DAYTON': {'Water': 'SWAN POND B', 'City': 'DAYTON'}
}


In [17]:
#Check fixlater is done
newone = df[df["City"].isin(fixlater)]
print(newone)

Empty DataFrame
Columns: [Date, Water, City, Qty, Size (inch), Fish]
Index: []


In [18]:
# Iterate through the dictionary and apply the changes
for city, values in changes_dict.items():
    # Update the 'Water' and 'City' columns where the City matches
    df.loc[df['City'] == city, 'Water'] = values['Water']
    df.loc[df['City'] == city, 'City'] = values['City']


In [19]:
#Visual check for nay other cities to fix
df["City"].unique()

array(['LISBON', 'AUBURN', 'MECHANIC FALLS', 'MINOT', 'LIVERMORE',
       'TURNER', 'DURHAM', 'POLAND', 'LEWISTON', 'PRESQUE ISLE',
       'T19 R11 WELS', 'T14 R9 WELS', 'SAINT JOHN PLT', 'T15 R9 WELS',
       'HOULTON', 'LITTLETON', 'MONTICELLO', 'GRAND ISLE', 'T16 R5 WELS',
       'NEW CANADA', 'WESTON', 'EAGLE LAKE', 'LINNEUS', 'LIMESTONE',
       'FORT KENT', 'T17 R12 WELS', 'T18 R10 WELS', 'ORIENT', 'MORO PLT',
       'REED PLT', 'MAPLETON', None, 'T14 R8 WELS', 'T17 R4 WELS',
       'T16 R4 WELS', 'T4 R3 WELS', 'HAYNESVILLE', 'OAKFIELD',
       'FORKSTOWN TWP', 'T11 R10 WELS', 'TC R2 WELS', 'MOLUNKUS TWP',
       'FORT FAIRFIELD', 'NEW LIMERICK', 'T9 R3 WELS', 'ISLAND FALLS',
       'PORTAGE LAKE', 'T19 R12 WELS', 'T11 R8 WELS', 'T11 R4 WELS',
       'SQUAPAN TWP', 'HODGDON', 'DYER BROOK', 'T12 R14 WELS',
       'T7 R5 WELS', 'BRIDGTON', 'GORHAM', 'SEBAGO', 'WESTBROOK',
       'NEW GLOUCESTER', 'WINDHAM', 'NORTH YARMOUTH', 'POWNAL', 'CASCO',
       'BRUNSWICK', 'NAPLES', 'GRAY', 

In [20]:
df[df["Water"]=="SUGAR BERTH P P"]

Unnamed: 0,Date,Water,City,Qty,Size (inch),Fish


In [21]:
#Manual fix of values with dictionary
changes_dict2 = {
    'BROOK TROUT': {
        'Water': 'ANDERSON P',        
        'City': 'T10 SD',            
        'Fish': 'BROOK TROUT'    
    },
    'L (ELLIS P) BELGRADE': {
        'Water': 'SALMON L (ELLIS P)',
        'City': 'BELGRADE',
        'Fish': 'BROOK TROUT'  
    },
    'P T4 R13 WELS': {
        'Water': 'SALMON P',
        'City': 'T4 R13 WELS',
        'Fish': 'BROOK TROUT'  
         },
    'P T10 SD': {
        'Water': 'RAINBOW P',
        'City': 'T10 SD',
        'Fish': 'BROOK TROUT'  
    },
    'P T30 MD BPP': {
        'Water': 'SALMON P',
        'City': 'T30 MD BPP',
        'Fish': 'BROOK TROUT'  
    },
    'L (ELLIS P) BELGRADE BROOK TROUT': {
        'Water': 'SALMON L (ELLIS P)',
        'City': 'BELGRADE',
        'Fish': 'BROOK TROUT'  
    },
    'L (ELLIS P) BELGRADE BROWN TROUT': {
        'Water': 'SALMON L (ELLIS P)',
        'City': 'BELGRADE',
        'Fish': 'BROWN TROUT'  
    },
}


In [22]:
#Check dict fixes worked
print(df.iloc[[581,689,690,691,1265,1615,1616]])

            Date               Water         City  Qty  Size (inch)  \
581    10/3/2024            SALMON P       T10 SD  300            8   
689    4/12/2024  SALMON L (ELLIS P)     BELGRADE  440           10   
690   10/29/2024  SALMON L (ELLIS P)     BELGRADE  500           13   
691   10/29/2024  SALMON L (ELLIS P)     BELGRADE  400           12   
1265  10/15/2024            SALMON P  T4 R13 WELS  900            8   
1615   10/1/2024            SALMON P   T30 MD BPP   50           14   
1616   10/1/2024            SALMON P   T30 MD BPP  400            8   

             Fish  
581   BROOK TROUT  
689   BROOK TROUT  
690   BROOK TROUT  
691   BROWN TROUT  
1265  BROOK TROUT  
1615  BROOK TROUT  
1616  BROOK TROUT  


In [23]:
#get index of upton 
df[df["City"]=="POND UPTON"]

Unnamed: 0,Date,Water,City,Qty,Size (inch),Fish
857,5/16/2024,B,POND UPTON,350,7,L.L. SALMON


In [24]:
# Swap the values in city and water for 857, 1205, 402, 562, 342, 349, 350 using iloc
df.iloc[857, 2] = "UPTON"
df.iloc[857, 1] = "B POND"
df.iloc[1205, 2] = "SHAWTOWN TWP"
df.iloc[1205, 1] = "LONG BOG"
df.iloc[342, 2] = 'CHAIN OF PONDS TWP'
df.iloc[342, 1] = 'CARIBOU BOG'
df.iloc[349, 2] = "CHAIN OF PONDS TWP"
df.iloc[349, 1] = "CHAIN OF PONDS"
df.iloc[350, 2] = "CHAIN OF PONDS TWP"
df.iloc[350, 1] = "CHAIN OF PONDS"

# Swap value in water for 1553 and 1566 using iloc
df.iloc[1553, 1] = "HOSEA PUG L"
df.iloc[1566, 1] = "LEDGE (BALD LEDGE) P"

#Swap value in city for 402, 462 using iloc
df.iloc[402, 2] = "TOWNSHIP 6 NORTH OF WELD"
df.iloc[462, 2] = "TOWNSHIP 6 NORTH OF WELD"

In [25]:
#Rename fish column to "Species"
df.rename(columns={'Fish': 'Species'}, inplace=True)

In [26]:
#Reorder columns
df = df[["Date", "Water", "City", "Species", "Qty", "Size (inch)"]]
print(df)

            Date                    Water            City      Species  Qty  \
0      4/18/2024           ANDROSCOGGIN R          LISBON  BROOK TROUT  300   
1       5/7/2024           ANDROSCOGGIN R          LISBON  BROOK TROUT  200   
2      4/25/2024  ANDROSCOGGIN R (LITTLE)          AUBURN  BROWN TROUT  515   
3      4/25/2024  ANDROSCOGGIN R (LITTLE)  MECHANIC FALLS  BROWN TROUT  350   
4      4/25/2024  ANDROSCOGGIN R (LITTLE)           MINOT  BROWN TROUT  300   
...          ...                      ...             ...          ...  ...   
1794  11/28/2024                 WILSON L           ACTON  BROWN TROUT   17   
1795   4/10/2024                WORSTER B         BERWICK  BROOK TROUT  200   
1796    5/3/2024                WORSTER B         BERWICK  BROOK TROUT  200   
1797  10/16/2024                   YORK P           ELIOT  BROOK TROUT  200   
1798    5/7/2024                   YORK R           ELIOT  BROOK TROUT  200   

      Size (inch)  
0              10  
1          

In [27]:
#Check for only 6 species
df["Species"].unique()

array(['BROOK TROUT', 'BROWN TROUT', 'RAINBOW TROUT', 'LAKE TROUT',
       'L.L. SALMON', 'SPLAKE'], dtype=object)

In [28]:
#look for null city values
df[df["City"].isnull()]


Unnamed: 0,Date,Water,City,Species,Qty,Size (inch)
123,9/30/2024,HODGDON DEADWATER HODGDON,,BROWN TROUT,250,12
164,9/26/2024,SCOPAN STREAM MASARDIS,,L.L. SALMON,350,12
247,4/23/2024,OTTER P #2 STANDISH,,BROOK TROUT,200,10
248,10/15/2024,OTTER P #2 STANDISH,,BROOK TROUT,600,8
249,10/15/2024,OTTER P #2 STANDISH,,BROOK TROUT,125,12
250,10/31/2024,OTTER P #2 STANDISH,,BROOK TROUT,15,17
251,11/19/2024,OTTER P #2 STANDISH,,BROOK TROUT,400,8
252,4/23/2024,OTTER P #4 STANDISH,,BROOK TROUT,100,10
253,10/15/2024,OTTER P #4 STANDISH,,BROOK TROUT,300,8
254,10/15/2024,OTTER P #4 STANDISH,,BROOK TROUT,75,12


In [29]:
#Function to move town name from water column to town
def move_last_word_to_city(df):
    # Filter the rows where 'City' is null
    city_null_rows = df[df['City'].isnull()]
    
    # Apply the transformation to those rows
    for idx, row in city_null_rows.iterrows():
        # Get the current value in the 'Water' column
        water_value = row['Water']
        
        # Split the 'Water' column by spaces and get the last word
        words = water_value.split()
        if len(words) > 1:  # Ensure there's more than one word to move
            last_word = words[-1]  # Get the last word
            # Move the last word to the 'City' column
            df.at[idx, 'City'] = last_word
            # Remove the last word from the 'Water' column
            df.at[idx, 'Water'] = ' '.join(words[:-1])
    
    return df

# Apply the function to your DataFrame
df = move_last_word_to_city(df)


In [30]:
#Confirm null values fixed
df[df["City"].isnull()]

Unnamed: 0,Date,Water,City,Species,Qty,Size (inch)


In [31]:
#filling in shortened values
df['City'].replace('GRAND LAKE STREAM P', 'GRAND LAKE STREAM PLT', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].replace('GRAND LAKE STREAM P', 'GRAND LAKE STREAM PLT', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['City'].replace('GRAND LAKE STREAM P', 'GRAND LAKE STREAM PLT', inplace=True)


In [32]:
#More shortened values
df['City'].replace('SANDWICH ACADEMY G', 'SANDWICH ACADEMY GRANT TWP', inplace=True)
df['City'].replace('EAST MIDDLESEX CANA', 'EAST MIDDLESEX CANAL GRANT TWP', inplace=True)
df['City'].replace('T4 INDIAN PURCHASE T', 'T4 INDIAN PURCHASE TOWNSHIP', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].replace('SANDWICH ACADEMY G', 'SANDWICH ACADEMY GRANT TWP', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['City'].replace('SANDWICH ACADEMY G', 'SANDWICH ACADEMY GRANT TWP', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using

In [33]:
#Save as new csv
df.to_csv('Maine_Stocking_24.csv', index=False)