In [6]:
import pandas as pd

# Directory holding the two corpus files. Change this single value to point the
# notebook at another copy of the data. NOTE: the folder name ends with a space
# ("Task 2 "); it is kept so the resulting paths are unchanged.
DATA_DIR = "/Users/bramdewaal/Desktop/Uni/VSC/Software Engineering/Project/Task 2 "
source_file = f"{DATA_DIR}/data.en.txt"
target_file = f"{DATA_DIR}/data.nl.txt"


def read_sentences(path):
    """Read the UTF-8 text file at `path` and return its lines as a list.

    Each line has its leading/trailing whitespace (including the newline)
    removed. Blank lines are kept as empty strings so line positions are
    preserved.
    """
    # Iterating over the handle streams the file line by line, so only the
    # stripped copy of each line is held in memory (readlines() would keep a
    # second, unstripped copy of the whole file alive).
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


source_sentences = read_sentences(source_file)
target_sentences = read_sentences(target_file)

# Rows of the DataFrame pair the files by line position, so a difference in
# line counts may mean the pairs are misaligned. Report it instead of
# truncating silently; both lists are still cut to the shorter length.
min_length = min(len(source_sentences), len(target_sentences))
if len(source_sentences) != len(target_sentences):
    print(
        f"Warning: line counts differ (source={len(source_sentences)}, "
        f"target={len(target_sentences)}); truncating both to {min_length} lines."
    )
source_sentences = source_sentences[:min_length]
target_sentences = target_sentences[:min_length]

# Create a DataFrame with one column per language, one row per line position
df = pd.DataFrame({
    'Source Language': source_sentences,
    'Target Language': target_sentences
})

# Display the header of the DataFrame
print("Header:", df.columns.tolist())

# Display the first 5 rows of the DataFrame
print("First 5 rows:\n", df.head())

Header: ['Source Language', 'Target Language']
First 5 rows:
                                      Source Language  \
0  "All citizens of the euro area will have to le...   
1  "One member, one vote" and ad personam partici...   
2  "Price stability is defined as a year-on-year ...   
3                               "Representativeness"   
4  "The partnership with the European Blind Union...   

                                     Target Language  
0  "Alle burgers van de eurozone zullen met de ni...  
1        "Eén lid, één stem" en ad personam-deelname  
2  "Prijsstabiliteit wordt gedefinieerd als een j...  
3                               "Representativiteit"  
4  "Met de Europese Unie van Blinden (EUB), die z...  


In [7]:
# Volume: row count is reported as the number of sentences, column count as
# the number of languages.
num_rows = df.shape[0]
num_columns = df.shape[1]
print(f"Volume of Data - Number of Sentences: {num_rows}, Number of Languages: {num_columns}")

Volume of Data - Number of Sentences: 3528196, Number of Languages: 2


In [8]:
# Compare sentence lengths only when both text columns are present
if 'Source Language' in df.columns and 'Target Language' in df.columns:
    # Sentence length measured in characters. The .str.len() accessor is the
    # vectorized pandas form and yields NaN for a missing entry rather than
    # raising the TypeError that len() would.
    df['source_length'] = df['Source Language'].str.len()
    df['target_length'] = df['Target Language'].str.len()

    # Signed difference (source minus target): negative when the target
    # sentence is the longer one.
    df['length_diff'] = df['source_length'] - df['target_length']

    # Mean of the signed differences across all rows
    avg_length_diff = df['length_diff'].mean()
    print(f"Average difference in sentence length: {avg_length_diff} characters")
else:
    print("Source or Target language column not found for analysis.")

Average difference in sentence length: -16.053998417321488 characters


In [9]:
# Variety: show the dtype of every column
print("Variety of Data - Data Types of Features:")
print(df.dtypes)

# Tally columns by kind: object dtype is counted as categorical (text),
# int64/float64 as numerical (e.g., the sentence-length columns).
text_columns = df.select_dtypes(include=['object']).columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
num_categorical = len(text_columns)
num_numerical = len(numeric_columns)
print(f"Number of Categorical Features: {num_categorical}, Number of Numerical Features: {num_numerical}")

Variety of Data - Data Types of Features:
Source Language    object
Target Language    object
source_length       int64
target_length       int64
length_diff         int64
dtype: object
Number of Categorical Features: 2, Number of Numerical Features: 3


In [10]:
# Descriptive statistics (count, mean, std, quartiles, ...) as produced by
# DataFrame.describe() with default arguments
summary_statistics = df.describe()
print("Summary Statistics of Numerical Features:", summary_statistics, sep="\n")

# Number of distinct values in each object-dtype (text) column
text_only = df.select_dtypes(include=['object'])
unique_values = text_only.nunique()
print("Variety Analysis - Unique Values in Categorical Features:", unique_values, sep="\n")

Summary Statistics of Numerical Features:
       source_length  target_length   length_diff
count   3.528196e+06   3.528196e+06  3.528196e+06
mean    1.336958e+02   1.497498e+02 -1.605400e+01
std     9.485931e+01   1.070060e+02  2.709477e+01
min     1.000000e+00   1.000000e+00 -7.200000e+02
25%     6.100000e+01   6.800000e+01 -2.900000e+01
50%     1.190000e+02   1.330000e+02 -1.100000e+01
75%     1.840000e+02   2.070000e+02 -1.000000e+00
max     5.246000e+03   4.774000e+03  1.729000e+03
Variety Analysis - Unique Values in Categorical Features:
Source Language    3270641
Target Language    3336261
dtype: int64


In [11]:
# Veracity: count null entries per column
missing_values = df.isna().sum()
print("Veracity Analysis - Missing Values in Each Feature:", missing_values, sep="\n")

# Count rows that repeat an earlier row across all columns
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(f"Number of Duplicated Rows: {num_duplicates}")

Veracity Analysis - Missing Values in Each Feature:
Source Language    0
Target Language    0
source_length      0
target_length      0
length_diff        0
dtype: int64
Number of Duplicated Rows: 27438
