# Introduction(NEW MAIN)-->

In [35]:
import requests 
import os
from tqdm import tqdm # just for downloading progress bar

#URL of the MET Office historic station data page
url = 'https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data'

#Directory to save downloaded files
output_dir= 'MET_Office_Data'

#Create the output directory if it doesn't exist yet
os.makedirs(output_dir,exist_ok=True)

#Send a GET request to the URL
response= requests.get(url)

#Check if the request was successful (status code 200)
if response.status_code==200:
    #Extract text data file URLs from the response content
    file_urls= [line.split('"')[1] for line in response.text.splitlines() if '.txt' in line]
                
    #After ggetting all urls of the .txt files we download them            
    for file_url in tqdm(file_urls,desc="Downloading files", unit="file"):
        #Check if the URL has a valid scheme
         if file_url.startswith(('http://','https://')):
        #Extract filename from the URL
            filename = file_url.split('/')[-1]
        #Download the file
            with open(os.path.join(output_dir,filename),'wb') as f:
                f.write(requests.get(file_url).content)
    else:
        print(f"Skipping invalid URL:{file_url}")

else:
    print("Failed to retrieve data.Status code:",response.status_code)


Downloading files: 100%|██████████████████████| 75/75 [00:09<00:00,  8.17file/s]

Skipping invalid URL:type





In [37]:
import os

#Directory containing the text files
input_dir = 'MET_Office_Data'

#Directory to save cleaned files
output_dir= 'NoHeader_MET_Office_Data'

#Create the output directory if it doesn't exist
os.makedirs(output_dir,exist_ok= True)

#Loop through each file in the input directory
for filename in os.listdir(input_dir):
    #check if the file is a text file
    if filename.endswith('.txt'):
        #Read the contets of the file 
        with open(os.path.join(input_dir,filename), 'r') as file:
            lines = file.readlines()

       #Determine the number of header lines
        header_length = 5 if 'sunshine data' in lines[4].lower() else 6

        #Remove header
        data = lines[header_length:]

        #Write cleaned data to a new file
        with open(os.path.join(output_dir,filename),'w') as file:
            file.writelines(data)

print("Cleaning completed.")

Cleaning completed.


In [38]:
import os

#directory containing the cleaned text files
input_dir = 'NoHeader_MET_Office_Data'

for filename in os.listdir(input_dir): #Lopping through each file
    #Chech if the file is a text file
    if filename.endswith('.txt'):
        #Read the contents of the file
        with open(os.path.join(input_dir,filename),'r')as file:
            lines= file.readlines()
        #Remove the second row(header)
        cleaned_lines = [lines[0]] #Keep the first row
        cleaned_lines += lines[2:] #Skip the second row

        #Write cleaned data back to the same file
        with open(os.path.join(input_dir,filename),'w') as file:
            file.writelines(cleaned_lines)
print("Cleaning completed")

Cleaning completed


In [39]:
'''
Data Collection, Pre-processing and Handling (30 marks): Download the
MET historic data from all 36 weather stations in the UK. Select a monitoring
time period when your data from all your selected weather stations have been
consistently recorded. Clean the data from any special symbols, e.g. ‘*’ or ‘#’.
Visualise and handle missing data, for example by replacing any missing values
with a zero or the mean values from all stations for that year.
'''
#Assign directory with tilde(~)
import os
import pandas as pd

def convert_txt_to_csv(input_folder, output_folder):
    #Make syre the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    #Loop through all text files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            input_filepath= os.path.join(input_folder,filename)
            output_filepath = os.path.join(output_folder,os.path.splitext(filename)[0]+".csv")

            #Read the fixed-width format file
            df= pd.read_fwf(input_filepath)
            #Save the DataFrame to a CSV file
            df.to_csv(output_filepath, index=False)

#Usage of convert_txt_to_csv()
input_folder = 'NoHeader_MET_Office_Data'
output_folder = 'DataAE1.CSV'
convert_txt_to_csv(input_folder,output_folder)

In [40]:
import pandas as pd

#Directory containing CSV files
input_directory = 'DataAE1.csv'
#Directory to save modified CSV files
output_directory = 'DataAE1csv.cleaned'

#Create the output directory if it doesn't exist
os.makedirs(output_directory,exist_ok= True)

#Loop through all CSV files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith('.csv'):
        input_filepath=os.path.join(input_directory,filename)
        #Read CSV file into DataFrame
        df=pd.read_csv(input_filepath)
        #Replace '---' with 0
        df.replace('---',0,inplace=True)
        #Save modified DataFrame to a new CSV file in the output directory
        output_filepath= os.path.join(output_directory,f'cleaned_{filename}')
        df.to_csv(output_filepath,index=False)
