Importing Libraries

In [57]:
!pip install transformers
!pip install torch
!pip install pandas
!pip3 install together
!pip3 install openai



In [58]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch, pandas as pd
import subprocess
import os
from openai import OpenAI
from together import Together

In [59]:
# Set TOKENIZERS_PARALLELISM to "false" for this process before any model work runs.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
# OPENAI_API_KEY and TOGETHER_API_KEY are expected to be present in the
# environment. Do not print them here: cell output is saved with the notebook.

Defining the LLM baseline class and loading datasets

In [52]:
class LlmBaseline:
    """Produce natural-language summaries of tabular data with several LLM backends.

    The instance holds a list of model names (run locally through the
    ``ollama`` CLI), a list of pandas DataFrames, and the summaries generated
    so far. Hosted backends (DeepSeek, Together) are available through
    dedicated methods that summarise a single dataframe.
    """

    def __init__(self, models=None, df=None):
        """Create a baseline runner.

        Args:
            models: Iterable of Ollama model names, or a single name. ``None``
                or an empty value means "no models yet".
            df: Iterable of DataFrames, or a single DataFrame. ``None`` or an
                empty value means "no dataframes yet".
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(models, str):
            models = [models]
        # Copy so that add_df()/read_dataset_path() never mutate the caller's list.
        self.models = list(models) if models else []

        if df is None:
            self.dataframes = []
        elif isinstance(df, pd.DataFrame):
            # `if df` on a DataFrame raises ValueError; accept a single frame directly.
            self.dataframes = [df]
        else:
            self.dataframes = list(df)

        # One entry per processed dataframe:
        # {"data": <json str>, "summaries": [{"model": ..., "summary": ...}, ...]}
        self.summaries = []

    def add_df(self, df):
        """Append an already-loaded DataFrame to the list to be summarised."""
        self.dataframes.append(df)

    def read_dataset_path(self, path):
        """Read a CSV file from ``path`` and append it to the dataframes."""
        self.dataframes.append(pd.read_csv(path))

    def get_all_df(self):
        """Return the list of registered DataFrames."""
        return self.dataframes

    def generate_summary(self, model_name, prompt):
        """Send ``prompt`` to ``model_name`` through the ``ollama`` CLI.

        Args:
            model_name: Name passed to ``ollama run``.
            prompt: Text fed to the process on stdin.

        Returns:
            The process's stdout with surrounding whitespace stripped. This may
            be empty if the command failed; a diagnostic is printed then.
        """
        # The model runs in a separate process, so nothing is loaded into this kernel.
        result = subprocess.run(
            ["ollama", "run", model_name],
            input=prompt,
            text=True,
            capture_output=True,
        )

        # Surface failures instead of silently returning an empty summary.
        if result.returncode != 0:
            print(
                f"ollama run {model_name} failed "
                f"(exit code {result.returncode}): {result.stderr.strip()}"
            )

        return result.stdout.strip()

    def generate_promt(self, data):
        """Build the summarisation prompt that embeds ``data`` verbatim.

        Args:
            data: Serialised dataset (e.g. the JSON or CSV string produced by
                the converter methods).

        Returns:
            The prompt string.
        """
        # NOTE: the indentation inside the template is part of the prompt text.
        return f"""
            Based on the given data:
            {data}
            Your task is to write a brief, fluent, and coherent single-paragraph summary
            in natural language. The text should be balanced and neutral. Make sure that all the
            facts mentioned in the text can be derived from the input data, do *not* add any extra
            information.
        """

    # Convert the dataframe to readable formats i.e. csv and json
    def data_converter_csv(self, df):
        """Serialise ``df`` to a CSV string without the index column."""
        return df.to_csv(index=False)

    def data_converter_json(self, df):
        """Serialise ``df`` to a JSON array of row objects.

        The index is materialised as a column via ``reset_index``. Datetime
        columns are written as ISO-8601 strings: the pandas default for
        ``orient='records'`` is epoch milliseconds, which a language model
        cannot reliably read as calendar dates.
        """
        return df.reset_index().to_json(orient='records', date_format='iso')

    def generator(self):
        """Summarise every dataframe with every configured Ollama model.

        Results are appended to ``self.summaries`` (repeated calls accumulate).

        Returns:
            ``self.summaries``, or ``None`` when there are no models or no
            dataframes (a message is printed in that case).
        """
        if not self.models:
            print("No models selected")
            return
        if not self.dataframes:
            print("No dataframes added")
            return

        for df in self.dataframes:
            data = self.data_converter_json(df)
            prompt = self.generate_promt(data)
            per_model = [
                {"model": model, "summary": self.generate_summary(model, prompt)}
                for model in self.models
            ]
            self.summaries.append({"data": data, "summaries": per_model})

        return self.summaries

    def deepseek_generator(self, df_index=0, model="deepseek-chat"):
        """Summarise one dataframe through the DeepSeek chat-completions endpoint.

        Args:
            df_index: Position in ``self.dataframes`` of the frame to summarise.
            model: DeepSeek model identifier.

        Returns:
            The text content of the first returned choice.

        Raises:
            IndexError: If ``df_index`` is out of range.
        """
        # The key is read from OPENAI_API_KEY even though the base URL is DeepSeek's.
        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'), base_url="https://api.deepseek.com")

        data = self.data_converter_json(self.dataframes[df_index])
        prompt = self.generate_promt(data)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt},
            ],
            stream=False
        )

        return response.choices[0].message.content

    def together_generator(self, df_index=2, model="meta-llama/Llama-4-Scout-17B-16E-Instruct"):
        """Summarise one dataframe through the Together chat-completions API.

        Args:
            df_index: Position in ``self.dataframes`` of the frame to summarise.
            model: Together model identifier.

        Returns:
            The text content of the first returned choice.

        Raises:
            IndexError: If ``df_index`` is out of range.
        """
        client = Together()  # auth defaults to os.environ.get("TOGETHER_API_KEY")

        data = self.data_converter_json(self.dataframes[df_index])
        prompt = self.generate_promt(data)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )
        return response.choices[0].message.content

    def save_summaries(self):
        """Write every stored summary to ``summary_<n>_<model>.txt`` in the cwd.

        ``<n>`` is the 1-based position of the dataframe in ``self.summaries``.
        Prints a message and returns if nothing has been generated yet.
        """
        if not self.summaries:
            print("No summaries generated")
            return

        # Iterate the instance's own results, not a notebook-global variable.
        for i, summary in enumerate(self.summaries):
            for item in summary['summaries']:
                # Model ids may contain path separators or ':' (e.g. "org/model",
                # "llama3:8b"), which are not safe inside a filename.
                safe_model = str(item['model'])
                for bad in ("/", "\\", ":"):
                    safe_model = safe_model.replace(bad, "_")
                filename = f"summary_{i+1}_{safe_model}.txt"
                # Summaries contain non-ASCII characters such as '°'; be explicit.
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(item['summary'])
                print(f"Saved {filename}")

In [39]:
# Resolve the kernel's working directory; the dataset paths are built from it.
current_directory = os.getcwd()
print("Current working directory: " + current_directory)

Current working directory: /Users/muhammadasad/Desktop/Data Mining/Project


In [40]:
# Load the three five-year CSV files from the "Datasets" folder under the
# working directory, one DataFrame per period.
df2010_2014, df2015_2019, df2020_2024 = (
    pd.read_csv(os.path.join(current_directory, "Datasets", f"{period}.csv"))
    for period in ("2010_2014", "2015_2019", "2020_2024")
)

In [41]:
# Preview the first rows of the 2020-2024 frame to check the columns loaded as expected.
df2020_2024.head()

Unnamed: 0,time,temperature_2m_mean (°C),temperature_2m_max (°C),temperature_2m_min (°C),apparent_temperature_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),...,wind_direction_10m_dominant (°),winddirection_10m_dominant (°),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),relative_humidity_2m_mean (%),relative_humidity_2m_max (%),relative_humidity_2m_min (%),sunshine_duration (s)
0,2020-01-01,-1.4,1.0,-4.4,-6.8,-4.9,-8.7,0.2,0.0,0.14,...,252,252,31.7,16.6,14.4,6.9,67,78,50,9334.95
1,2020-01-02,3.6,8.3,-1.6,-1.1,2.6,-6.8,0.0,0.0,0.0,...,217,217,30.0,16.1,20.5,10.5,68,81,49,25522.97
2,2020-01-03,3.8,8.4,-0.7,0.5,4.6,-4.2,0.0,0.0,0.0,...,266,266,17.5,9.2,10.8,6.1,85,95,66,16050.96
3,2020-01-04,0.9,3.0,-0.4,-3.0,-1.2,-4.0,1.9,0.0,1.33,...,330,330,17.3,8.8,9.4,4.0,74,90,59,5190.75
4,2020-01-05,-0.2,2.0,-1.6,-4.8,-2.8,-6.5,2.8,0.0,1.96,...,266,266,24.0,12.3,15.1,8.0,71,91,56,7100.75


Data Preprocessing

In [42]:
# Count missing values per column in the 2010-2014 frame.
df2010_2014.isnull().sum()

time                               0
temperature_2m_mean (°C)           0
temperature_2m_max (°C)            0
temperature_2m_min (°C)            0
apparent_temperature_mean (°C)     0
apparent_temperature_max (°C)      0
apparent_temperature_min (°C)      0
precipitation_sum (mm)             0
rain_sum (mm)                      0
snowfall_sum (cm)                  0
precipitation_hours (h)            0
wind_speed_10m_max (km/h)          0
wind_gusts_10m_max (km/h)          0
wind_direction_10m_dominant (°)    0
winddirection_10m_dominant (°)     0
wind_gusts_10m_mean (km/h)         0
wind_speed_10m_mean (km/h)         0
wind_gusts_10m_min (km/h)          0
wind_speed_10m_min (km/h)          0
relative_humidity_2m_mean (%)      0
relative_humidity_2m_max (%)       0
relative_humidity_2m_min (%)       0
sunshine_duration (s)              0
dtype: int64

In [43]:
# Count missing values per column in the 2015-2019 frame.
df2015_2019.isnull().sum()

time                               0
temperature_2m_mean (°C)           0
temperature_2m_max (°C)            0
temperature_2m_min (°C)            0
apparent_temperature_mean (°C)     0
apparent_temperature_max (°C)      0
apparent_temperature_min (°C)      0
precipitation_sum (mm)             0
rain_sum (mm)                      0
snowfall_sum (cm)                  0
precipitation_hours (h)            0
wind_speed_10m_max (km/h)          0
wind_gusts_10m_max (km/h)          0
wind_direction_10m_dominant (°)    0
winddirection_10m_dominant (°)     0
wind_gusts_10m_mean (km/h)         0
wind_speed_10m_mean (km/h)         0
wind_gusts_10m_min (km/h)          0
wind_speed_10m_min (km/h)          0
relative_humidity_2m_mean (%)      0
relative_humidity_2m_max (%)       0
relative_humidity_2m_min (%)       0
sunshine_duration (s)              0
dtype: int64

In [44]:
# Count missing values per column in the 2020-2024 frame.
df2020_2024.isnull().sum()

time                               0
temperature_2m_mean (°C)           0
temperature_2m_max (°C)            0
temperature_2m_min (°C)            0
apparent_temperature_mean (°C)     0
apparent_temperature_max (°C)      0
apparent_temperature_min (°C)      0
precipitation_sum (mm)             0
rain_sum (mm)                      0
snowfall_sum (cm)                  0
precipitation_hours (h)            0
wind_speed_10m_max (km/h)          0
wind_gusts_10m_max (km/h)          0
wind_direction_10m_dominant (°)    0
winddirection_10m_dominant (°)     0
wind_gusts_10m_mean (km/h)         0
wind_speed_10m_mean (km/h)         0
wind_gusts_10m_min (km/h)          0
wind_speed_10m_min (km/h)          0
relative_humidity_2m_mean (%)      0
relative_humidity_2m_max (%)       0
relative_humidity_2m_min (%)       0
sunshine_duration (s)              0
dtype: int64

In [45]:
# Inspect column dtypes; `time` is still an object column here and is converted in a later cell.
df2020_2024.dtypes

time                                object
temperature_2m_mean (°C)           float64
temperature_2m_max (°C)            float64
temperature_2m_min (°C)            float64
apparent_temperature_mean (°C)     float64
apparent_temperature_max (°C)      float64
apparent_temperature_min (°C)      float64
precipitation_sum (mm)             float64
rain_sum (mm)                      float64
snowfall_sum (cm)                  float64
precipitation_hours (h)            float64
wind_speed_10m_max (km/h)          float64
wind_gusts_10m_max (km/h)          float64
wind_direction_10m_dominant (°)      int64
winddirection_10m_dominant (°)       int64
wind_gusts_10m_mean (km/h)         float64
wind_speed_10m_mean (km/h)         float64
wind_gusts_10m_min (km/h)          float64
wind_speed_10m_min (km/h)          float64
relative_humidity_2m_mean (%)        int64
relative_humidity_2m_max (%)         int64
relative_humidity_2m_min (%)         int64
sunshine_duration (s)              float64
dtype: object

In [46]:
# Parse the `time` column of every frame from object dtype into datetimes (in place).
for frame in (df2010_2014, df2015_2019, df2020_2024):
    frame['time'] = pd.to_datetime(frame['time'])

Calling the summary generator function

In [53]:
# Model names handed to `ollama run`, and the three period frames to summarise.
models = ["llama3", "gemma", "mistral"]
dataframes = [df2010_2014, df2015_2019, df2020_2024]

llm_baseline = LlmBaseline(models=models, df=dataframes)

In [None]:
# Run every configured model over every dataframe; each pair shells out to `ollama run`.
summaries = llm_baseline.generator()

In [299]:
# Show the per-model summaries produced for the first dataframe.
# print([(item['summaries']) for item in summaries])
summaries[0]['summaries']

[{'model': 'llama3',
  'summary': 'Here is a brief, fluent, and coherent single-paragraph summary of the given data:\n\nBetween January 1st, 2006 and February 2nd, 2007, the weather in this region experienced varying conditions. The temperature ranged from a low of -8.9°C to a high of 7.8°C, with an average mean temperature of 2.4°C. There were instances of precipitation, including snowfall on February 1st, and sunshine was recorded throughout the period. Wind speeds varied significantly, reaching up to 47.9 km/h, with gusts of up to 49.7 km/h. The relative humidity fluctuated between 34% and 96%, while the apparent temperature ranged from -14.7°C to 3.2°C. Overall, the weather during this period was marked by a mix of cold and warm temperatures, precipitation, and varying wind conditions.'},
 {'model': 'gemma',
  'summary': 'The provided dataset offers a comprehensive hourly record of weather conditions over a period of several days. Temperatures fluctuated widely, ranging from lows o

In [300]:
# Write one text file per (dataframe, model) summary into the current working directory.
llm_baseline.save_summaries()

Saved summary_1_llama3.txt
Saved summary_1_gemma.txt
Saved summary_1_mistral.txt
Saved summary_2_llama3.txt
Saved summary_2_gemma.txt
Saved summary_2_mistral.txt
Saved summary_3_llama3.txt
Saved summary_3_gemma.txt
Saved summary_3_mistral.txt


In [17]:
# Summary statistics for the 2010-2014 frame, for comparison with the generated text.
df2010_2014.describe()

Unnamed: 0,time,temperature_2m_mean (°C),temperature_2m_max (°C),temperature_2m_min (°C),apparent_temperature_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),...,wind_direction_10m_dominant (°),winddirection_10m_dominant (°),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),relative_humidity_2m_mean (%),relative_humidity_2m_max (%),relative_humidity_2m_min (%),sunshine_duration (s)
count,1826,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,...,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0,1826.0
mean,2012-07-01 12:00:00.000000256,8.630887,12.61977,4.802848,6.084009,10.428642,1.891347,2.28494,1.958598,0.25351,...,220.2908,220.2908,26.666484,12.169496,14.622946,6.512432,71.179628,86.049288,54.726177,29942.863735
min,2010-01-01 00:00:00,-19.5,-16.3,-24.9,-27.0,-22.7,-30.3,0.0,0.0,0.0,...,0.0,0.0,9.2,3.8,2.9,0.0,33.0,49.0,22.0,0.0
25%,2011-04-02 06:00:00,0.3,3.4,-2.4,-4.3,-0.9,-7.0,0.0,0.0,0.0,...,147.5,147.5,19.7,8.7,9.4,3.4,63.0,80.0,44.0,19921.4725
50%,2012-07-01 12:00:00,8.9,12.8,5.15,5.6,9.95,1.4,0.0,0.0,0.0,...,244.0,244.0,25.1,11.4,12.6,6.0,71.0,88.0,54.0,32400.0
75%,2013-09-30 18:00:00,17.9,22.2,13.8,17.5,22.2,12.7,2.0,1.2,0.0,...,296.0,296.0,32.3,14.9,18.4,8.975,79.0,94.0,64.0,42858.74
max,2014-12-31 00:00:00,30.5,34.8,25.8,33.2,38.0,29.0,48.8,48.8,20.02,...,360.0,360.0,73.6,33.1,52.2,22.0,98.0,100.0,96.0,51718.02
std,,10.599886,10.971047,10.41875,12.922165,13.434982,12.53837,5.034955,4.816576,1.145414,...,96.126306,96.126306,9.236191,4.562563,7.270312,4.051253,11.125069,9.690033,13.501925,15447.939452


In [54]:
# Summarise through the Together API. together_generator() reads the dataframe at
# index 2 of the baseline's list, which is df2020_2024 given the order used above.
summary = llm_baseline.together_generator()
print(summary)

The provided data appears to be a collection of daily weather records, covering a period of 30 days from December 2020 to January 2021. During this time, the mean temperature at 2 meters above ground level varied significantly, ranging from -11°C to 6.5°C. The data also shows that there were days with substantial precipitation, including rain and snowfall, with the highest recorded rainfall being 46.5 mm and the highest snowfall being 11.13 cm. Wind speeds were also notable, with maximum gusts reaching up to 87.8 km/h and an average wind speed of around 10-20 km/h. Additionally, the data indicates that there were days with significant sunshine duration, ranging from 0 seconds to 51043.1 seconds, which is equivalent to approximately 14.2 hours of sunshine. Overall, the data suggests that the region experienced a mix of cold and mild temperatures, varying precipitation, and changing wind conditions during the observed period.
