In [1]:
import pandas as pd
from datetime import datetime
import re
# Load the three CSV exports. Every file name is relative, so pandas resolves
# it against the current working directory.
file_name = 'TV2_1-93.csv'
df_1_93 = pd.read_csv(file_name)

sport = 'sport.csv'
df_sport = pd.read_csv(sport)

vejr = 'Data-Vejr.csv'
df_vejr = pd.read_csv(vejr)



In [2]:
# Stack the three frames on top of each other (row-wise concatenation, not a
# key-based merge). ignore_index=True drops the per-file row labels and
# renumbers the combined rows from 0.
merged_df = pd.concat(
    objs=(df_1_93, df_sport, df_vejr),
    ignore_index=True,
)


In [3]:
#Format the time
def format_timestamp(timestamp):
    """Reformat an ISO-style timestamp string to minute precision.

    Args:
        timestamp: String shaped like '2023-08-15T08:38:12.000Z'
            (date, 'T', time with fractional seconds, literal 'Z').

    Returns:
        The same instant as 'YYYY-MM-DD HH:MM'; seconds and fractions are
        dropped, not rounded.

    Raises:
        ValueError: If the string does not match the expected layout.
    """
    parsed = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
    # A datetime format-spec is handed to strftime under the hood.
    return f"{parsed:%Y-%m-%d %H:%M}"

# Rewrite every raw timestamp string in 'Time' to the 'YYYY-MM-DD HH:MM' form
merged_df['Time'] = merged_df['Time'].map(format_timestamp)

In [4]:
# This cell moves the clock forward: +2 hours in summer time, +1 hour in
# winter time. First turn the 'YYYY-MM-DD HH:MM' strings into datetime values
# so that time arithmetic is possible.
merged_df['Time'] = pd.to_datetime(
    merged_df['Time'],
    format='%Y-%m-%d %H:%M',
)

# Custom function to adjust time based on conditions
def adjust_time(row):
    """Convert a row's UTC 'Time' value to Danish local wall-clock time.

    The previous rule (+2 h for months 3..10, otherwise +1 h) only
    approximated daylight saving time: summer time actually starts on the
    last Sunday of March and ends on the last Sunday of October, so most of
    March and the final days of October received the wrong offset. A real
    time-zone conversion applies +1 h (CET) or +2 h (CEST) on the correct
    dates.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose 'Time' entry is
            a pandas Timestamp. Naive values are interpreted as UTC.
            NOTE(review): the source strings end in 'Z', which suggests UTC,
            and the target zone is assumed to be Europe/Copenhagen -- confirm.

    Returns:
        A timezone-naive pandas Timestamp holding the local wall-clock time.
    """
    stamp = row['Time']
    if stamp.tzinfo is None:
        # Attach UTC so the conversion below knows the source zone.
        stamp = stamp.tz_localize('UTC')
    # Convert to local time, then drop the tz info so the column stays naive
    # like the values the original +1 h / +2 h arithmetic produced.
    return stamp.tz_convert('Europe/Copenhagen').tz_localize(None)

# Run adjust_time once per row and overwrite 'Time' with the shifted values.
# NOTE: because 'Time' is overwritten, re-running this cell shifts the
# already-shifted values again.
merged_df['Time'] = merged_df.apply(adjust_time, axis='columns')

In [5]:
# This cell labels each row with one of three time-of-day groups.
# Make sure 'Time' holds datetime values before reading the clock time from it.
merged_df['Time'] = pd.to_datetime(
    merged_df['Time'],
    format='%Y-%m-%d %H:%M:%S',
)

# Define a function to assign time group based on the time of day
def assign_time_group(time):
    """Classify a timestamp into a time-of-day group.

    Groups (bounds inclusive, compared at minute resolution):
        'morning'  00:01 - 08:00
        'day'      08:01 - 16:00
        'evening'  everything else (16:01 - 00:00)

    The comparison ignores seconds. Previously the bounds were full clock
    times, so e.g. 08:00:30 lay after the end of 'morning' (08:00:00) and
    before the start of 'day' (08:01:00) and fell through to 'evening'.
    Whole-minute inputs are classified exactly as before.

    Args:
        time: Object exposing ``hour`` and ``minute`` attributes, such as a
            pandas Timestamp or a datetime.

    Returns:
        One of 'morning', 'day' or 'evening'.
    """
    # Tuples compare lexicographically: hour first, then minute.
    clock = (time.hour, time.minute)
    if (0, 1) <= clock <= (8, 0):
        return 'morning'
    if (8, 1) <= clock <= (16, 0):
        return 'day'
    return 'evening'

# Build the 'time_group' column by classifying each value in 'Time'
merged_df['time_group'] = [
    assign_time_group(stamp) for stamp in merged_df['Time']
]


In [6]:
# Define a function to calculate LIX score
def calculate_lix(text):
    """Compute the LIX readability score of a text.

    LIX = (words / sentences) + (long_words * 100 / words), where words are
    runs of ``\\w`` characters, sentences are counted as the number of '.',
    '!' and '?' characters, and long words have more than 6 characters.

    Args:
        text: The text to score.

    Returns:
        The LIX score as a float, or None when no score applies:
        - ``text`` is not a string (e.g. a missing value such as NaN),
        - ``text`` contains one of the excluded boilerplate snippets,
        - ``text`` contains no word tokens.
        A text with words but no sentence terminator is scored as a single
        sentence instead of raising ZeroDivisionError.
    """
    # Non-text cells cannot be scored; previously they raised TypeError.
    if not isinstance(text, str):
        return None

    # Boilerplate snippets whose presence excludes the text from scoring.
    excluded_snippets = (
        "Her giver vi dig dagens nyheder i kort form. Vi opdaterer hele tiden, hele dagen. Du kan tippe TV 2s journalist på livecenter@tv2.dk",
        "Denne liveblog opdateres ikke længere.",
    )
    if any(snippet in text for snippet in excluded_snippets):
        return None

    words = re.findall(r'\w+', text)  # Tokenize words
    if not words:
        # Nothing to score; avoids dividing by len(words) == 0.
        return None

    # Count sentence terminators; fall back to one sentence so a text without
    # any '.', '!' or '?' does not divide by zero.
    sentences = max(1, sum(text.count(mark) for mark in '.!?'))
    long_words = sum(1 for word in words if len(word) > 6)
    return (len(words) / sentences) + (long_words * 100) / len(words)

# Score every text in 'All_text' and store the result in a new 'LIX' column
merged_df['LIX'] = merged_df['All_text'].map(calculate_lix)

# Leave the frame as the cell's last expression so the notebook displays it
merged_df


Unnamed: 0,Category,Authors,Headline,Time,Live,All_text,time_group,LIX
0,Udland,Mathias Hobolth Østerlund,Efter to år med Taleban er kvindernes sidste f...,2023-08-15 10:38:00,0,['Efter to år ved magten har Taliban gjort det...,day,39.097430
1,Samfund,Signe Marie Frost,"Elever bruger to smuthuller i telefonforbud, f...",2023-08-15 10:38:00,0,['Eleverne vil altid være i stand til at finde...,day,39.229083
2,Lokalt,"Amanda Nygaard Frisk, Jessica Skovmose",Færre færgeafgange til øer presser landmænd,2023-08-15 09:56:00,0,"['På Fejø, Femø og Askø skal en ny elfærge fin...",day,38.546371
3,Samfund,Amalie Abildgaard,Tidligere islamist: - Man skal 100 procent tag...,2023-08-15 09:01:00,0,"['Al-Qaeda har udsendt en trussel mod Danmark,...",day,42.699095
4,Krigen i Ukraine,Livecenter,Seneste nyt om krigen i Ukraine,2023-08-15 09:00:00,1,['Hvad sker der? Rusland har udført et stort l...,day,44.711451
...,...,...,...,...,...,...,...,...
5746,Vejr,Jonas Damsbo,Ny kulderekord i Kina,2023-01-24 16:41:00,0,"['Kina har sat ny officiel kulderekord, efter ...",evening,37.546268
5747,Vejr,Jonas Damsbo,Pænt tirsdagsvejr – her kommer der mest sol,2023-01-24 07:30:00,0,['Temperaturen forventes at nå over frysepunkt...,morning,32.914980
5748,Vejr,Jonas Damsbo,Spektakulært isfænomen vokser frem i danske skove,2023-01-23 18:20:00,0,"['Fænomenet ligner fine, hvide hår og går unde...",evening,36.754386
5749,Vejr,Jeppe Lykke Hansen,Sæler forvirret af varmt vejr – kan ikke finde...,2023-01-23 18:20:00,0,['Når vejret er for varmt til det tykke lag vi...,evening,36.179444
