In [5]:
def load_data(file_path):
    """Read a CSV file into a DataFrame and print a short schema overview.

    Args:
        file_path: Path to the CSV file to read.

    Returns:
        pandas.DataFrame holding the contents of the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing
            regular file (a directory is rejected as well).
    """
    # Local imports keep the function self-contained.
    import os

    import pandas as pd

    # isfile (rather than exists) also rejects directories, which would
    # otherwise pass the check and fail inside read_csv with an unrelated error.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Filen {file_path} hittades inte!")

    # low_memory=False disables chunked parsing, so every column gets a single
    # inferred dtype instead of possibly mixed types.
    df = pd.read_csv(file_path, low_memory=False)
    print("Kolumnnamn:", df.columns.tolist())
    print("\nDatatyper:\n", df.dtypes)

    return df

# Run the loader with an actual path argument.
# NOTE(review): this is a machine-specific absolute path; consider a path
# relative to the project directory so the notebook runs elsewhere.
transactions_csv = r"C:\Users\milan\PycharmProjects\Datakvalitet_project\transactions.csv"
df = load_data(transactions_csv)


Kolumnnamn: ['transaction_id', 'timestamp', 'amount', 'currency', 'sender_account', 'receiver_account', 'sender_country', 'sender_municipality', 'receiver_country', 'receiver_municipality', 'transaction_type', 'notes']

Datatyper:
 transaction_id           object
timestamp                object
amount                   object
currency                 object
sender_account           object
receiver_account         object
sender_country           object
sender_municipality      object
receiver_country         object
receiver_municipality    object
transaction_type         object
notes                    object
dtype: object


In [6]:
# Report the dtype and the number of missing values for every column.
missing_counts = df.isnull().sum()
for name, dtype in df.dtypes.items():
    print(f"{name}: Datatyp = {dtype}, Saknade värden = {missing_counts[name]}")


transaction_id: Datatyp = object, Saknade värden = 0
timestamp: Datatyp = object, Saknade värden = 0
amount: Datatyp = object, Saknade värden = 0
currency: Datatyp = object, Saknade värden = 0
sender_account: Datatyp = object, Saknade värden = 0
receiver_account: Datatyp = object, Saknade värden = 0
sender_country: Datatyp = object, Saknade värden = 500
sender_municipality: Datatyp = object, Saknade värden = 500
receiver_country: Datatyp = object, Saknade värden = 500
receiver_municipality: Datatyp = object, Saknade värden = 500
transaction_type: Datatyp = object, Saknade värden = 0
notes: Datatyp = object, Saknade värden = 9982


In [7]:
import pandas as pd

def identify_datetime_columns(df, keywords=("date", "timestamp")):
    """Return the columns whose label suggests they hold dates.

    The match is a case-insensitive substring test on the column label, so it
    is a name-based heuristic only; the column contents are not inspected.

    Args:
        df: DataFrame whose column labels are examined.
        keywords: Substring, or iterable of substrings, that marks a column as
            a date candidate. Defaults to ``("date", "timestamp")``.

    Returns:
        list of the matching column labels, in their original order.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = (keywords,)
    lowered_keywords = [str(keyword).lower() for keyword in keywords]

    possible_date_cols = [
        col
        for col in df.columns
        # str() lets non-string labels (e.g. integer column labels) be
        # checked instead of raising AttributeError on .lower().
        if any(keyword in str(col).lower() for keyword in lowered_keywords)
    ]
    print("Identifierade datumkolumner:", possible_date_cols)
    return possible_date_cols

# Try it on your DataFrame
date_columns = identify_datetime_columns(df)


Identifierade datumkolumner: ['timestamp']


In [8]:
# Parse every detected date column; errors="coerce" turns entries that cannot
# be parsed into NaT instead of raising.
for date_col in date_columns:
    parsed = pd.to_datetime(df[date_col], errors="coerce")
    df[date_col] = parsed


In [9]:
# info() writes its report straight to stdout and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
df[date_columns].info()  # confirm the dtypes are now datetime
print(df[date_columns].head())  # look at a few rows to check the format


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  99984 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 781.4 KB
None
            timestamp
0 2025-03-06 12:04:00
1 2025-03-31 01:37:00
2 2025-04-29 22:58:00
3 2025-02-27 14:52:00
4 2025-01-03 22:29:00


In [10]:
# Show the rows whose timestamp is NaT
rows_without_timestamp = df.loc[df["timestamp"].isna()]
print(rows_without_timestamp)


                            transaction_id timestamp    amount currency  \
372   e13b1664-c549-4f92-895a-1f941dd6d82e       NaT  12881.63      SEK   
374   7bfcf599-e449-4e63-a3f9-05631511e58a       NaT  20223.17      SEK   
432   b75a64f1-d111-4261-9df0-53f2c38e2828       NaT  45069.28      SEK   
566   bc91f1d3-e0b0-471b-844f-7e14941d3236       NaT   9908.07      SEK   
658   e4ec0cfc-55e8-4ca1-b3f4-0948a1b8e997       NaT   3080.18      SEK   
661   13638365-972b-4fd8-9518-f27557a7e1cd       NaT  34157.12      SEK   
761   fda4c622-1044-49aa-bc41-b44bdd50bc9c       NaT   2192.15      SEK   
814   bc23bd09-5b6a-4ebe-95ae-7dad0cde315a       NaT  21899.83      SEK   
817   9687e5ce-f8e0-4afa-ac0c-54d8c6ef457f       NaT  36969.56      SEK   
821   aee31e4c-e8c1-4fa5-b8bf-53815ab51b55       NaT   39794.0      SEK   
866   f46345d0-c1f8-46d7-ae78-47c4a6b52fee       NaT   9573.92      SEK   
876   2f48c8d5-cad7-48b0-8f8c-2f1324911dfd       NaT  34373.92      SEK   
884   776cbccf-2d92-4e83-

In [11]:
# Add a boolean flag marking rows with a missing timestamp
df["is_missing_timestamp"] = df["timestamp"].isnull()

# Check the result on the first few rows
flag_preview = df.loc[:, ["timestamp", "is_missing_timestamp"]].head()
print(flag_preview)


            timestamp  is_missing_timestamp
0 2025-03-06 12:04:00                 False
1 2025-03-31 01:37:00                 False
2 2025-04-29 22:58:00                 False
3 2025-02-27 14:52:00                 False
4 2025-01-03 22:29:00                 False


In [12]:
import os
print(os.getcwd())  # Show the current working directory


C:\Users\milan\PycharmProjects\Datakvalitet_project


In [13]:
# Missing values per column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Number of fully duplicated rows
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

# Summary statistics across all columns, numeric and non-numeric alike
summary_stats = df.describe(include="all")
print(summary_stats)


transaction_id              0
timestamp                  16
amount                      0
currency                    0
sender_account              0
receiver_account            0
sender_country            500
sender_municipality       500
receiver_country          500
receiver_municipality     500
transaction_type            0
notes                    9982
is_missing_timestamp        0
dtype: int64
0
                              transaction_id                      timestamp  \
count                                 100000                          99984   
unique                                100000                            NaN   
top     4fe6e9b8-8cb5-4187-8d36-9cb1682e234b                            NaN   
freq                                       1                            NaN   
mean                                     NaN  2025-03-06 04:12:27.486738432   
min                                      NaN            2022-04-17 01:00:00   
25%                                      N

In [14]:
# Convert amount to a numeric dtype. errors="coerce" turns unparseable entries
# into NaN, so report how many values were lost that way instead of dropping
# them silently.
amount_numeric = pd.to_numeric(df["amount"], errors="coerce")
coerced_count = int(amount_numeric.isna().sum() - df["amount"].isna().sum())
print(f"amount: {coerced_count} värden kunde inte konverteras till tal")
df["amount"] = amount_numeric


In [15]:
print(df.dtypes)  # Show the current dtypes


transaction_id                   object
timestamp                datetime64[ns]
amount                          float64
currency                         object
sender_account                   object
receiver_account                 object
sender_country                   object
sender_municipality              object
receiver_country                 object
receiver_municipality            object
transaction_type                 object
notes                            object
is_missing_timestamp               bool
dtype: object


In [16]:
df.dtypes  # Display the column dtypes as the cell's output value (repeats the preceding print)


transaction_id                   object
timestamp                datetime64[ns]
amount                          float64
currency                         object
sender_account                   object
receiver_account                 object
sender_country                   object
sender_municipality              object
receiver_country                 object
receiver_municipality            object
transaction_type                 object
notes                            object
is_missing_timestamp               bool
dtype: object