# Analysing German news articles

## Imports

In [68]:
import pandas as pd
import os
from datetime import datetime 

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Load data into DataFrames

In [100]:
# Folder that holds the scraped CSV backlog.
# NOTE(review): absolute, user-specific path - adjust when running elsewhere.
CSV_folderPath="/Users/jan/Documents/Python_Projects/Bachelorthesis/CSV_Backlog"

# One list of per-file DataFrames per news site, plus one for all files.
Spiegel_csvs = []
Sueddeutsche_csvs = []
Bild_csvs = []
Combi_csvs = []


# The working directory is switched (as before) so the bare file names
# returned by os.listdir() can be opened directly.
os.chdir(CSV_folderPath)

# Only regular *.csv files are read: hidden files or sub-folders in the
# directory would otherwise be handed to pd.read_csv as well.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, and the load order decides which duplicate row survives a later
# drop_duplicates (it keeps the first occurrence).
for file in sorted(os.listdir()):
    if not (file.lower().endswith(".csv") and os.path.isfile(file)):
        continue
    # read csv to dataframe
    df = pd.read_csv(file, index_col=None, header=0)
    # every file contributes to the combined list
    Combi_csvs.append(df)
    # additionally sort the frame into its news-site list by file-name prefix
    if file.startswith("Spiegel"):
        Spiegel_csvs.append(df)
    elif file.startswith("Sueddeutsche"):
        Sueddeutsche_csvs.append(df)
    elif file.startswith("Bild"):
        Bild_csvs.append(df)

# Stack the per-file frames row-wise with a fresh 0..n-1 index.
All_articles = pd.concat(Combi_csvs, axis=0, ignore_index=True)
Spiegel_articles = pd.concat(Spiegel_csvs, axis=0, ignore_index=True)
Sueddeutsche_articles = pd.concat(Sueddeutsche_csvs, axis=0, ignore_index=True)
# pd.concat raises on an empty list, so fall back to an empty DataFrame.
# A DataFrame (instead of a plain list) keeps Bild_articles the same type as
# the other frames; len(Bild_articles) is still 0 in that case.
if Bild_csvs:
    Bild_articles = pd.concat(Bild_csvs, axis=0, ignore_index=True)
else:
    Bild_articles = pd.DataFrame()

    
# Remove repeated articles: within each frame only the first row per
# distinct "Titel" value is kept.
All_articles = All_articles.drop_duplicates(subset=["Titel"], keep="first")
Spiegel_articles = Spiegel_articles.drop_duplicates(subset=["Titel"], keep="first")
Sueddeutsche_articles = Sueddeutsche_articles.drop_duplicates(subset=["Titel"], keep="first")

# Bild articles are currently not de-duplicated (step disabled):
#Bild_articles = Bild_articles.drop_duplicates(subset="title")

# Summary of the combined frame (columns, non-null counts, dtypes).
All_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16645 entries, 0 to 41555
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Titel                  16645 non-null  object 
 1   Date_Info              16645 non-null  object 
 2   URL                    16645 non-null  object 
 3   Overline               10049 non-null  object 
 4   Breadcrumb             7682 non-null   object 
 5   author                 1672 non-null   object 
 6   detailed_informations  2366 non-null   object 
 7   Zugriff_Datum          10345 non-null  float64
 8   News_page              16645 non-null  object 
 9   Unnamed: 0             0 non-null      object 
dtypes: float64(1), object(9)
memory usage: 1.4+ MB


## Data Cleaning

### Cast Date_Info to datetime

In [101]:



# Layout of the raw "Date_Info" strings, e.g. day-month-year_hour:minute.
DATE_INFO_FORMAT = '%d%m%Y_%H:%M'


def _parse_date_info(raw_dates):
    """Return ``raw_dates`` as a datetime Series.

    Surrounding whitespace is stripped before parsing with
    ``DATE_INFO_FORMAT``. A column that is already datetime is returned
    unchanged, so re-running this cell no longer fails on the ``.str``
    accessor (which only exists for string data).
    """
    if pd.api.types.is_datetime64_any_dtype(raw_dates):
        return raw_dates
    return pd.to_datetime(raw_dates.str.strip(), format=DATE_INFO_FORMAT)


# Same conversion for the combined frame and each per-site frame.
for _frame in (All_articles, Spiegel_articles, Sueddeutsche_articles):
    _frame["Date_Info"] = _parse_date_info(_frame["Date_Info"])

# Bild articles are not converted yet (no Bild data loaded so far).

   


## Initial data analysis

### Amount of articles

In [102]:
# Row counts per news site and for the combined frame.
ammount_of_Spiegel_articles = len(Spiegel_articles)
ammount_of_Sueddeutsche_articles = len(Sueddeutsche_articles)
ammount_of_Bild_articles = len(Bild_articles)
ammount_of_Combined_articles = len(All_articles)

# Report the counts, one line each.
print(f"Spiegel articles: {ammount_of_Spiegel_articles}")
print(f"Sueddeutsche articles: {ammount_of_Sueddeutsche_articles}")
print(f"Bild articles: {ammount_of_Bild_articles}")
print(f"Overall: {ammount_of_Combined_articles}")

Spiegel articles: 6300
Sueddeutsche articles: 10346
Bild articles: 0
Overall: 16645


## Analysis

### Exploding titles

In [122]:
def explode_titles(title_column):
    """Tokenize every title and drop German stopwords.

    Parameters
    ----------
    title_column : pandas.Series
        Article titles (strings).

    Returns
    -------
    pandas.DataFrame
        The titles under their original column name plus a
        ``tokenized_title`` column holding, per row, the list of tokens
        left after stopword removal. The index of ``title_column`` is kept.

    Notes
    -----
    Only stopwords are removed; numbers and punctuation tokens are kept.
    Stopwords are matched case-insensitively because the NLTK list is
    lower-case while title words are often capitalised (e.g. "Die").
    """
    # Load the stopword list once and use a set for fast membership tests,
    # instead of re-reading the corpus for every single word.
    german_stopwords = set(stopwords.words("german"))

    def _tokenize(title):
        # Missing / non-string titles yield no tokens instead of raising.
        if not isinstance(title, str):
            return []
        return [
            word
            for word in word_tokenize(title)
            if word.lower() not in german_stopwords
        ]

    titles_frame = title_column.to_frame()
    # Assign the whole column at once: writing to the row objects handed out
    # by iterrows() is not guaranteed to change the underlying frame.
    titles_frame["tokenized_title"] = title_column.map(_tokenize)
    return titles_frame
            
            

### Tokenize All_articles Title

In [123]:
# Store the token lists produced by explode_titles next to each article title.
All_articles["tokenized_titles"] = explode_titles(All_articles["Titel"])["tokenized_title"]