In [None]:
import pandas as pd
import warnings

# Suppress all warnings to keep output clean.
# NOTE(review): this filter is global to the Python process, so warnings raised
# by any code run afterwards are hidden as well.
warnings.filterwarnings("ignore")

# Initialize dictionaries to store station information
# station_number -> station attributes plus the list of years the station was seen in
total_station_info = {}
# station_number -> years in which the station had more than one attribute combination
total_one_to_many = {}

# Define the range of years for which data is available (2000 through 2023)
years = range(2000, 2024)

# Attribute columns that must be identical on every row of a station for the
# station to count as consistent within one year's file
STATION_FIELDS = ['station_name', 'station_type', 'northing', 'easting', 'ngr']

# Iterate over each year, load the dataset, and process station data
for year in years:
    # Only the station columns are used below, so the other columns are not loaded.
    # low_memory=False makes pandas infer each column's dtype from the whole file
    # instead of chunk by chunk; chunked inference can yield mixed types, and the
    # warning about it would be hidden by the filter above.
    df = pd.read_csv(
        f'{year} Water Quality Archive Unified.csv',
        usecols=['station_number'] + STATION_FIELDS,
        low_memory=False,
    )

    # De-duplicate once for the whole file rather than once per station: a station
    # is left with exactly one row here if and only if all of its records agree.
    # drop_duplicates keeps the first occurrence, so that row is the station's
    # first record in the file.
    station_rows = df.drop_duplicates(subset=['station_number'] + STATION_FIELDS)

    # Group the de-duplicated rows by station number
    for station_number, group in station_rows.groupby('station_number'):
        # If the station's name, type, and coordinates are consistent across all
        # of its records in this year's file
        if len(group) == 1:
            row = group.iloc[0]
            # Store the station information if it's not already in the dictionary.
            # Attributes come from the first year the station is seen in; later
            # years only extend the 'Years' list and are not compared against them.
            if station_number not in total_station_info:
                total_station_info[station_number] = {
                    'Station Name': row['station_name'],
                    'Station Type': row['station_type'],
                    'Northing': row['northing'],
                    'Easting': row['easting'],
                    'ngr': row['ngr'],
                    'Years': []  # Initialize a list to track the years of data for this station
                }
            # Add the current year to the list of years for this station
            total_station_info[station_number]['Years'].append(year)
        else:
            # The station has conflicting records within this year's file: record
            # the year in the 'one-to-many' dictionary. A station can therefore
            # appear in both dictionaries if it is consistent in other years.
            total_one_to_many.setdefault(station_number, []).append(year)

# Convert the total_station_info dictionary to a DataFrame (one row per station,
# indexed by station number). The columns are listed explicitly so that the frame
# still has a 'Years' column when no station was collected; without this, an
# empty dictionary makes the 'Years' lookup below fail with a KeyError.
station_info_df = pd.DataFrame.from_dict(
    total_station_info,
    orient='index',
    columns=['Station Name', 'Station Type', 'Northing', 'Easting', 'ngr', 'Years'],
)

# Convert the list of years into a comma-separated string for saving to CSV
station_info_df['Years'] = station_info_df['Years'].apply(lambda x: ', '.join(map(str, sorted(x))))

# Save the station information DataFrame to a CSV file, overwriting any existing file.
# The index (station number) is written as the first column.
station_info_df.to_csv('total_station_info_1.csv', mode='w')

# Convert the total_one_to_many dictionary to a DataFrame and save to CSV.
# Each (station, list-of-years) pair becomes one row; the list is written as-is.
one_to_many_df = pd.DataFrame(list(total_one_to_many.items()),
                              columns=['Station Number', 'Years'])
one_to_many_df.to_csv('total_one_to_many.csv', mode='w', index=False)

# Print the number of entries in each DataFrame for confirmation
station_info_count = len(station_info_df)
print(f"Total entries in station_info: {station_info_count}")

one_to_many_count = len(one_to_many_df)
print(f"\nTotal entries in one_to_many: {one_to_many_count}")



In [None]:
import pandas as pd

# Load the station information table from disk
station_info_df = pd.read_csv('total_station_info_1.csv')

# Render the same table as HTML markup, leaving out the DataFrame's row index
html_table = station_info_df.to_html(index=False)

# Show the plain-text view of the table first, then the generated HTML markup
print(station_info_df, html_table, sep='\n')



In [None]:
import pandas as pd

# Load the station information table from disk
station_info_df = pd.read_csv('total_station_info_1.csv')

# Work with 'Years' as text: each entry is a ', '-separated list of years
years_text = station_info_df['Years'].astype(str)
station_info_df['Years'] = years_text

# The number of ', '-separated pieces is the number of years the station appears in
station_info_df['Year Count'] = [len(entry.split(', ')) for entry in years_text]

# Write the table, now including 'Year Count', to a new CSV file without the row index
station_info_df.to_csv('total_station_info_with_counts.csv', index=False)

# Print the DataFrame to check the results
print(station_info_df)



In [None]:
import pandas as pd

# Load the per-station table that includes the 'Year Count' column
station_info_df = pd.read_csv('total_station_info_with_counts.csv')

# Keep stations whose year count is anything other than 1, then order them
# from the largest year count to the smallest
filtered_df = station_info_df.loc[station_info_df['Year Count'] != 1]
sorted_df = filtered_df.sort_values('Year Count', ascending=False)

# Report how many stations share each distinct 'Year Count' value,
# following the descending order established above
year_counts = sorted_df['Year Count']
for year_count in year_counts.unique():
    stations_with_same_count = sorted_df[year_counts == year_count]
    print(f"There are {len(stations_with_same_count)} stations with {year_count} years of record:")





In [None]:
import pandas as pd

# Read the saved CSV file with station information.
# index_col=0 turns the file's first column into the DataFrame index; in this
# notebook that column holds the station number carried over from the earlier exports.
station_info_df = pd.read_csv('total_station_info_with_counts.csv', index_col=0)

# Standardize the text in the 'Station Type' column by removing extra spaces and converting to lowercase
station_info_df['Station Type'] = station_info_df['Station Type'].str.strip().str.lower()

# Keep only the stations whose 'Station Type' contains the term 'minewater'
# (rows with a missing station type are treated as non-matching)
minewater_stations_df = station_info_df[station_info_df['Station Type'].str.contains('minewater', na=False)]

# Save the filtered minewater station information to a new CSV file.
# The station number lives in the index after the read above, so the index has
# to be written out; saving with index=False would drop the station identifier
# from the file entirely.
minewater_stations_df.to_csv('minewater_stations_info.csv', index_label='Station Number')

# Display the filtered minewater station information
print("Minewater Stations Information:")
print(minewater_stations_df)


In [None]:
import pandas as pd

# Load the CSV file.
# index_col=0 turns the file's first column into the DataFrame index; in this
# notebook that column holds the station number carried over from the earlier exports.
station_info_df = pd.read_csv('total_station_info_with_counts.csv', index_col=0)

# Standardize the 'Station Type' column text (trim whitespace, lowercase)
station_info_df['Station Type'] = station_info_df['Station Type'].str.strip().str.lower()

# Filter for stations with 'Station Type' containing 'minewater'
# (rows with a missing station type are treated as non-matching)
minewater_stations_df = station_info_df[station_info_df['Station Type'].str.contains('minewater', na=False)]

# Sort the DataFrame by 'Year Count' in descending order, when that column is present
if 'Year Count' in minewater_stations_df.columns:
    minewater_stations_df = minewater_stations_df.sort_values(by='Year Count', ascending=False)

# Save the filtered and sorted minewater station information to a new CSV file.
# The station number lives in the index after the read above, so the index has
# to be written out; saving with index=False would drop the station identifier
# from the file entirely.
minewater_stations_df.to_csv('minewater_stations_info_sorted.csv', index_label='Station Number')

# Display the filtered and sorted minewater station information
print("Minewater Stations Information Sorted:")
print(minewater_stations_df)


In [None]:
import pandas as pd

# Load the station table with year counts, using the file's first column as the index
station_info_df = pd.read_csv('total_station_info_with_counts.csv', index_col=0)

# Normalise 'Station Type': trim surrounding whitespace and lowercase the text
normalised_type = station_info_df['Station Type'].str.strip().str.lower()
station_info_df['Station Type'] = normalised_type

# Select the stations whose normalised type mentions 'minewater'
# (missing station types count as non-matching)
is_minewater = normalised_type.str.contains('minewater', na=False)
minewater_stations_df = station_info_df[is_minewater]

# Tally minewater stations per 'Year Count' value, highest year count first
year_count_summary = (
    minewater_stations_df['Year Count']
    .value_counts()
    .sort_index(ascending=False)
)

# Print the heading followed by the tally
print("Number of minewater stations by Year Count:", year_count_summary, sep="\n")
